"""Generate random scores.
"""
import csv
import logging
import os
import click
import numpy
from bob.extension.scripts.click_helper import verbosity_option
from bob.io.base import create_directories_safe
logger = logging.getLogger(__name__)
def gen_score_distr(
mean_neg,
mean_pos,
sigma_neg=10,
sigma_pos=10,
n_neg=5000,
n_pos=5000,
seed=0,
):
"""Generate scores from normal distributions
Parameters
----------
mean_neg : float
Mean for negative scores
mean_pos : float
Mean for positive scores
sigma_neg : float
STDev for negative scores
sigma_pos : float
STDev for positive scores
n_pos: int
The number of positive scores generated
n_neg: int
The number of negative scores generated
seed: int
A value to initialize the Random Number generator. Giving the same
value (or not specifying 'seed') on two different calls will generate
the same lists of scores.
Returns
-------
neg_scores : :any:`list`
Negatives scores
pos_scores : :any:`list`
Positive scores
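
    Examples
    --------
    A minimal usage sketch; with well-separated means, the sample means
    stay ordered:

    >>> neg, pos = gen_score_distr(-10, 10, n_neg=100, n_pos=100)
    >>> float(neg.mean()) < float(pos.mean())
    True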
"""
logger.debug("Initializing RNG.")
numpy.random.seed(seed)
logger.info(f"Generating {n_neg} negative and {n_pos} positive scores.")
neg_scores = numpy.random.normal(loc=mean_neg, scale=sigma_neg, size=n_neg)
pos_scores = numpy.random.normal(loc=mean_pos, scale=sigma_pos, size=n_pos)
return neg_scores, pos_scores
def write_scores_to_file(
neg,
pos,
filename,
n_subjects=5,
n_probes_per_subject=5,
n_unknown_subjects=0,
neg_unknown=None,
to_csv=True,
five_col=False,
metadata={"meta0": "data0", "meta1": "data1"},
):
"""Writes score distributions
Parameters
----------
neg : :py:class:`numpy.ndarray`
Scores for negative samples.
pos : :py:class:`numpy.ndarray`
Scores for positive samples.
filename : str
The path to write the score to.
n_subjects: int
Number of different subjects
n_probes_per_subject: int
Number of different samples used as probe for each subject
n_unknown_subjects: int
The number of unknown (no registered model) subjects
neg_unknown: None or list
The of unknown subjects scores
to_csv: bool
Use the CSV format, else the legacy 4 or 5 columns format.
five_col : bool
If 5-colum format, else 4-column
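
    Examples
    --------
    A minimal usage sketch (the output path is illustrative):

    >>> neg, pos = gen_score_distr(-10, 10, n_neg=10, n_pos=10)
    >>> write_scores_to_file(neg, pos, "/tmp/scores-dev.csv", n_subjects=2)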
"""
logger.debug(f"Creating result directories ('{filename}').")
create_directories_safe(os.path.dirname(filename))
s_subjects = ["x%d" % i for i in range(n_subjects)]
logger.debug("Writing scores to files.")
with open(filename, "wt") as f:
if to_csv:
csv_writer = csv.writer(f)
csv_writer.writerow(
["bio_ref_subject_id", "probe_subject_id", "key", "score"]
+ list(metadata.keys())
)
# Generate one line per probe (unless "--force-count" specified)
logger.debug("Writing positive scores.")
for i, score in enumerate(pos):
s_name = s_subjects[int(i / n_probes_per_subject) % n_subjects]
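            # The legacy 5-column format inserts a model id ("d" + subject)
            # between the reference subject id and the probe subject id.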
s_five = " " if not five_col else " d" + s_name + " "
probe_id = "%s_%d" % (s_name, i % n_probes_per_subject)
if to_csv:
csv_writer.writerow(
[s_name, s_name, probe_id, score] + list(metadata.values())
)
else:
f.write(
"%s%s%s %s %f\n" % (s_name, s_five, s_name, probe_id, score)
)
# Generate one line per probe against each ref (unless "--force-count" specified)
logger.debug("Writing negative scores.")
for i, score in enumerate(neg):
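            # Iteration order: probe index varies fastest, then the impostor
            # subject, then the reference subject.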
n_impostors = n_subjects - 1
ref = s_subjects[
int(i / n_probes_per_subject / n_impostors) % n_subjects
]
            impostors = [s for s in s_subjects if s != ref]  # exclude the genuine subject
probe = impostors[int(i / n_probes_per_subject) % n_impostors]
s_five = " " if not five_col else " d" + ref
probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
if to_csv:
csv_writer.writerow(
[ref, probe, probe_id, score] + list(metadata.values())
)
else:
f.write(
"%s%s%s %s %f\n" % (ref, s_five, probe, probe_id, score)
)
logger.debug("Writing unknown scores.")
if neg_unknown is not None:
s_unknown_subjects = ["u%d" % i for i in range(n_unknown_subjects)]
for i, score in enumerate(neg_unknown):
ref = s_subjects[
int(i / n_probes_per_subject / n_unknown_subjects)
% n_subjects
]
probe = s_unknown_subjects[
int(i / n_probes_per_subject) % n_unknown_subjects
]
s_five = " " if not five_col else " d" + ref + " "
probe_id = "%s_%d" % (probe, i % n_probes_per_subject)
if to_csv:
csv_writer.writerow(
[ref, probe, probe_id, score] + list(metadata.values())
)
else:
f.write(
"%s%s%s %s %f\n" % (ref, s_five, probe, probe_id, score)
)
@click.command(
epilog="""
Scores generation examples:
Output 'scores-dev.csv' and 'scores-eval.csv' in a new folder 'generated_scores/':
$ bob bio gen ./generated_scores
Output scores similar to a system evaluated on the AT&T dataset dev group:
$ bob bio gen -s 20 -p 5 ./generated_scores
Output a given number of scores in each file:
$ bob bio gen -f --n-neg 500 --n-pos 100 ./generated_scores
Include unknown subjects scores:
$ bob bio gen -s 5 -u 2 ./generated_scores
Change the mean and standard deviation of the scores distributions:
$ bob bio gen -mm 1 -sp 0.3 -mnm -1 -sn 0.5 ./generated_scores
You can observe the distributions histograms in a pdf file with:
$ bob bio hist -e ./generated_scores/scores-{dev,eval}.csv -o hist_gen.pdf
"""
)
@click.argument("outdir")
@click.option(
"-mm",
"--mean-match",
default=10,
type=click.FLOAT,
show_default=True,
help="Mean for the positive scores distribution",
)
@click.option(
"-mnm",
"--mean-non-match",
default=-10,
type=click.FLOAT,
show_default=True,
help="Mean for the negative scores distribution",
)
@click.option(
"-p",
"--n-probes-per-subject",
default=5,
type=click.INT,
show_default=True,
help="Number of probes per subject",
)
@click.option(
"-s",
"--n-subjects",
default=50,
type=click.INT,
show_default=True,
help="Number of subjects",
)
@click.option(
"-sp",
"--sigma-positive",
default=10,
type=click.FLOAT,
show_default=True,
help="Variance for the positive score distributions",
)
@click.option(
"-sn",
"--sigma-negative",
default=10,
type=click.FLOAT,
show_default=True,
help="Variance for the negative score distributions",
)
@click.option(
"-u",
"--n-unknown-subjects",
default=0,
type=click.INT,
show_default=True,
help="Number of unknown subjects (useful for open-set plots)",
)
@click.option(
"-f",
"--force-count",
"force_count",
is_flag=True,
help="Use --n-pos and --n-neg amounts instead of the subject and sample counts",
)
@click.option(
"--n-pos",
"n_pos",
default=5000,
type=click.INT,
show_default=True,
help="Number of Positive verifications (number of lines in the file)",
)
@click.option(
"--n-neg",
"n_neg",
default=5000,
type=click.INT,
show_default=True,
help="Number of Negative verifications (number of lines in the file)",
)
@click.option(
"--n-unk",
"n_unk",
default=5000,
type=click.INT,
show_default=True,
help="Number of Unknown verifications (number of lines in the file)",
)
@click.option("--csv/--legacy", default=True, show_default=True)
@click.option("--five-col/--four-col", default=False, show_default=True)
@verbosity_option()
def gen(
outdir,
mean_match,
mean_non_match,
n_probes_per_subject,
n_subjects,
sigma_positive,
sigma_negative,
n_unknown_subjects,
csv,
five_col,
force_count,
n_pos,
n_neg,
n_unk,
**kwargs,
):
"""Generate random scores.
Generates random scores in 4col or 5col format. The scores are generated
using Gaussian distribution whose mean and variance are an input
parameter. The generated scores can be used as hypothetical datasets.
This command generates scores relative to the number of subjects and
probes per subjects, unless the -f flag is set. In that case, the --n-pos
and --n-neg options are used as number of genuine and impostor
comparisons.
"""
# Compute the number of verifications needed
if force_count:
neg_count, pos_count, unknown_count = n_neg, n_pos, n_unk
else:
# One reference (model), and `n_probes_per_subject` probes per subject
neg_count = n_subjects * n_probes_per_subject * (n_subjects - 1)
pos_count = n_probes_per_subject * n_subjects
unknown_count = n_unknown_subjects * n_subjects * n_probes_per_subject
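        # e.g. with the defaults (50 subjects, 5 probes per subject):
        #   pos_count = 50 * 5 = 250 and neg_count = 50 * 5 * 49 = 12250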
# Generate the data
logger.info("Generating dev scores.")
neg_dev, pos_dev = gen_score_distr(
mean_non_match,
mean_match,
sigma_negative,
sigma_positive,
n_neg=neg_count,
n_pos=pos_count,
seed=0,
)
logger.info("Generating eval scores.")
neg_eval, pos_eval = gen_score_distr(
mean_non_match,
mean_match,
sigma_negative,
sigma_positive,
n_neg=neg_count,
n_pos=pos_count,
seed=1,
)
    # For simplicity, the same unknown-score distribution is used for dev and eval
if n_unknown_subjects:
logger.info("Generating unknown scores.")
neg_unknown, _ = gen_score_distr(
mean_non_match,
mean_match,
sigma_negative,
sigma_positive,
n_neg=unknown_count,
n_pos=0,
seed=2,
)
else:
neg_unknown = None
# Write the data into files
logger.info("Saving results.")
write_scores_to_file(
neg_dev,
pos_dev,
os.path.join(outdir, "scores-dev.csv"),
n_subjects,
n_probes_per_subject,
n_unknown_subjects,
neg_unknown,
csv,
five_col,
)
write_scores_to_file(
neg_eval,
pos_eval,
os.path.join(outdir, "scores-eval.csv"),
n_subjects,
n_probes_per_subject,
n_unknown_subjects,
neg_unknown,
csv,
five_col,
)