#!/usr/bin/env python
# coding=utf-8

import gzip
import logging
import os
import pickle
import sys

import click
import numpy

from bob.extension.scripts.click_helper import (
    ConfigCommand,
    ResourceOption,
    verbosity_option,
)

from ..engine.significance import (
    PERFORMANCE_FIGURES,
    index_of_outliers,
    sliding_window_performances,
    visual_performances,
    write_analysis_figures,
    write_analysis_text,
)
from .evaluate import _validate_threshold
from .evaluate import run as run_evaluation

logger = logging.getLogger(__name__)
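
# Both helpers below use the same gzip+pickle checkpointing pattern to cache
# expensive sliding window computations.  A minimal standalone sketch of that
# pattern (illustrative only; ``_cached`` is not a helper this module
# defines):
#
#   def _cached(path, compute):
#       if path is not None and os.path.exists(path):
#           with gzip.GzipFile(path, "r") as f:
#               return pickle.load(f)
#       value = compute()
#       if path is not None:
#           with gzip.GzipFile(path, "w") as f:
#               pickle.dump(value, f)
#       return value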


def _eval_sliding_windows(
    system_name,
    threshold,
    evaluate,
    preddir,
    dataset,
    steps,
    size,
    stride,
    outdir,
    figure,
    nproc,
    checkpointdir,
):
45 """Calculates the sliding window performances on a dataset
48 Parameters
49 ==========
51 system_name : str
52 The name of the current system being analyzed
54 threshold : :py:class:`float`, :py:class:`str`
55 This number is used to define positives and negatives from probability
56 maps, and report F1-scores (a priori). By default, we expect a set
57 named 'validation' to be available at the input data. If that is not
58 the case, we use 'train', if available. You may provide the name of
59 another dataset to be used for threshold tunning otherwise. If not
60 set, or a string is input, threshold tunning is done per system,
61 individually. Optionally, you may also provide a floating-point number
62 between [0.0, 1.0] as the threshold to use for both systems.
64 evaluate : str
65 Name of the dataset key to use from ``dataset`` to evaluate (typically,
66 ``test``)
68 preddir : str
69 Root path to the predictions generated by system ``system_name``. The
70 final subpath inside ``preddir`` that will be used will have the value
71 of this variable suffixed with the value of ``evaluate``. We will
72 search for ``<preddir>/<evaluate>/<stems>.hdf5``.
74 dataset : dict
75 A dictionary mapping string keys to
76 :py:class:`torch.utils.data.dataset.Dataset` instances
78 steps : int
79 The number of threshold steps to consider when evaluating the highest
80 possible F1-score on train/test data.
82 size : tuple
83 Two values indicating the size of windows to be used for the sliding
84 window analysis. The values represent height and width respectively
86 stride : tuple
87 Two values indicating the stride of windows to be used for the sliding
88 window analysis. The values represent height and width respectively
90 outdir : str
91 Path where to store visualizations. If set to ``None``, then do not
92 store performance visualizations.
94 figure : str
95 The name of a performance figure (e.g. ``f1_score``, ``jaccard``, or
96 ``accuracy``) to use when comparing performances
98 nproc : int
99 Sets the number of parallel processes to use when running using
100 multiprocessing. A value of zero uses all reported cores. A value of
101 ``1`` avoids completely the use of multiprocessing and runs all chores
102 in the current processing context.
104 checkpointdir : str
105 If set to a string (instead of ``None``), then stores a cached version
106 of the sliding window performances on disk, for a particular system.
109 Returns
110 =======
112 d : dict
113 A dictionary in which keys are filename stems and values are
114 dictionaries with the following contents:
116 ``winperf``: numpy.ndarray
117 A dataframe with all the sliding window performances aggregated,
118 for all input images.
120 ``n`` : numpy.ndarray
121 A 2D numpy array containing the number of performance scores for
122 every pixel in the original image
124 ``avg`` : numpy.ndarray
125 A 2D numpy array containing the average performances for every
126 pixel on the input image considering the sliding window sizes and
127 strides applied to the image
129 ``std`` : numpy.ndarray
130 A 2D numpy array containing the (unbiased) standard deviations for
131 the provided performance figure, for every pixel on the input image
132 considering the sliding window sizes and strides applied to the
133 image
135 """

    if checkpointdir is not None:
        chkpt_fname = os.path.join(
            checkpointdir,
            f"{system_name}-{evaluate}-{threshold}-"
            f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
        )
        os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
        if os.path.exists(chkpt_fname):
            logger.info(f"Loading checkpoint from {chkpt_fname}...")
            # loads and returns checkpoint from file
            try:
                with gzip.GzipFile(chkpt_fname, "r") as f:
                    return pickle.load(f)
            except EOFError as e:
                logger.warning(
                    f"Could not load sliding window performance "
                    f"from {chkpt_fname}: {e}. Calculating..."
                )
        else:
            logger.debug(
                f"Checkpoint not available at {chkpt_fname}. Calculating..."
            )
    else:
        chkpt_fname = None

    if not isinstance(threshold, float):

        assert threshold in dataset, f"No dataset named '{threshold}'"

        logger.info(
            f"Evaluating threshold on '{threshold}' set for "
            f"'{system_name}' using {steps} steps"
        )
        threshold = run_evaluation(
            dataset[threshold], threshold, preddir, steps=steps
        )
        logger.info(f"Set --threshold={threshold:.5f} for '{system_name}'")

    # for a given threshold on each system, calculate sliding window
    # performances
    logger.info(
        f"Evaluating sliding window '{figure}' on '{evaluate}' set for "
        f"'{system_name}' using windows of size {size} and stride {stride}"
    )

    retval = sliding_window_performances(
        dataset,
        evaluate,
        preddir,
        threshold,
        size,
        stride,
        figure,
        nproc,
        outdir,
    )

    # cache sliding window performance for later use, if necessary
    if chkpt_fname is not None:
        logger.debug(f"Storing checkpoint at {chkpt_fname}...")
        with gzip.GzipFile(chkpt_fname, "w") as f:
            pickle.dump(retval, f)

    return retval
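
# A minimal usage sketch for _eval_sliding_windows (the prediction path and
# dataset object are hypothetical; any supported performance figure works):
#
#   perf = _eval_sliding_windows(
#       "system1", "validation", "test", "/path/to/predictions/system-1",
#       dataset, 1000, (128, 128), (32, 32), None, "accuracy", 1, None,
#   )
#   for stem, stats in perf.items():
#       print(stem, stats["avg"].shape, stats["std"].shape)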


def _eval_differences(
    names,
    perfs,
    evaluate,
    dataset,
    size,
    stride,
    outdir,
    figure,
    nproc,
    checkpointdir,
):
214 """Evaluate differences in the performance sliding windows between two systems
216 Parameters
217 ----------
219 names : :py:class:`tuple` of :py:class:`str`
220 Names of the first and second systems
222 perfs : :py:class:`tuple` of :py:class:`dict`
223 Dictionaries for the sliding window performances of each system, as
224 returned by :py:func:`_eval_sliding_windows`
226 evaluate : str
227 Name of the dataset key to use from ``dataset`` to evaluate (typically,
228 ``test``)
230 dataset : dict
231 A dictionary mapping string keys to
232 :py:class:`torch.utils.data.dataset.Dataset` instances
234 size : tuple
235 Two values indicating the size of windows to be used for sliding window
236 analysis. The values represent height and width respectively
238 stride : tuple
239 Two values indicating the stride of windows to be used for sliding
240 window analysis. The values represent height and width respectively
242 outdir : str
243 If set to ``None``, then do not output performance visualizations.
244 Otherwise, in directory ``outdir``, dumps the visualizations for the
245 performance differences between both systems.
247 figure : str
248 The name of a performance figure (e.g. ``f1_score``, or ``jaccard``) to
249 use when comparing performances
251 nproc : int
252 Sets the number of parallel processes to use when running using
253 multiprocessing. A value of zero uses all reported cores. A value of
254 ``1`` avoids completely the use of multiprocessing and runs all chores
255 in the current processing context.
257 checkpointdir : str
258 If set to a string (instead of ``None``), then stores a cached version
259 of the sliding window performances on disk, for a particular difference
260 between systems.
263 Returns
264 -------
266 d : dict
267 A dictionary representing sliding window performance differences across
268 all files and sliding windows. The format of this is similar to the
269 individual inputs ``perf1`` and ``perf2``.
271 """

    if checkpointdir is not None:
        chkpt_fname = os.path.join(
            checkpointdir,
            f"{names[0]}-{names[1]}-{evaluate}-"
            f"{size[0]}x{size[1]}+{stride[0]}x{stride[1]}-{figure}.pkl.gz",
        )
        os.makedirs(os.path.dirname(chkpt_fname), exist_ok=True)
        if os.path.exists(chkpt_fname):
            logger.info(f"Loading checkpoint from {chkpt_fname}...")
            # loads and returns checkpoint from file
            try:
                with gzip.GzipFile(chkpt_fname, "r") as f:
                    return pickle.load(f)
            except EOFError as e:
                logger.warning(
                    f"Could not load sliding window performance "
                    f"from {chkpt_fname}: {e}. Calculating..."
                )
        else:
            logger.debug(
                f"Checkpoint not available at {chkpt_fname}. Calculating..."
            )
    else:
        chkpt_fname = None

    perf_diff = {
        k: perfs[0][k]["winperf"] - perfs[1][k]["winperf"] for k in perfs[0]
    }

    # calculates per-stem sliding window performance differences for the
    # given threshold
    logger.info(
        f"Evaluating sliding window '{figure}' differences on '{evaluate}' "
        f"set on '{names[0]}-{names[1]}' using windows of size {size} and "
        f"stride {stride}"
    )

    retval = visual_performances(
        dataset,
        evaluate,
        perf_diff,
        size,
        stride,
        figure,
        nproc,
        outdir,
    )

    # cache sliding window performance for later use, if necessary
    if chkpt_fname is not None:
        logger.debug(f"Storing checkpoint at {chkpt_fname}...")
        with gzip.GzipFile(chkpt_fname, "w") as f:
            pickle.dump(retval, f)

    return retval
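
# A minimal sketch of the difference evaluation (hypothetical inputs; perf1
# and perf2 as returned by _eval_sliding_windows for each system):
#
#   diff = _eval_differences(
#       ("system1", "system2"), (perf1, perf2), "test", dataset,
#       (128, 128), (32, 32), None, "accuracy", 1, None,
#   )
#   # diff[stem] derives from perf1[stem]["winperf"] - perf2[stem]["winperf"]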


@click.command(
    entry_point_group="bob.ip.binseg.config",
    cls=ConfigCommand,
    epilog="""Examples:

\b
    1. Runs a significance test using as base the calculated predictions of
       two different systems, on the **same** dataset:
\b
       $ bob binseg significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2
\b
    2. By default, we use a "validation" dataset if it is available, to infer
       the a priori threshold for the comparison of two systems. Otherwise,
       you may need to specify the name of a set to be used as validation set
       for choosing a threshold. The same goes for the set to be used for
       testing the hypothesis - by default we use the "test" dataset if it is
       available; otherwise, specify one.
\b
       $ bob binseg significance -vv drive --names system1 system2 --predictions=path/to/predictions/system-1 path/to/predictions/system-2 --threshold=train --evaluate=alternate-test
""",
)
@click.option(
    "--names",
    "-n",
    help="Names of the two systems to compare",
    nargs=2,
    required=True,
    type=str,
    cls=ResourceOption,
)
@click.option(
    "--predictions",
    "-p",
    help="Paths where the predictions of systems 1 and 2 are currently "
    "stored. You may also input predictions from a second-annotator. This "
    "application will adequately handle it.",
    nargs=2,
    required=True,
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    cls=ResourceOption,
)
@click.option(
    "--dataset",
    "-d",
    help="A dictionary mapping string keys to "
    "torch.utils.data.dataset.Dataset instances",
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--threshold",
    "-t",
    help="This number is used to define positives and negatives from "
    "probability maps, and report F1-scores (a priori). By default, we "
    "expect a set named 'validation' to be available at the input data. "
    "If that is not the case, we use 'train', if available. You may provide "
    "the name of another dataset to be used for threshold tuning otherwise. "
    "If not set, or a string is input, threshold tuning is done per system, "
    "individually. Optionally, you may also provide a floating-point number "
    "between [0.0, 1.0] as the threshold to use for both systems.",
    default="validation",
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--evaluate",
    "-e",
    help="Name of the dataset key to evaluate (typically, 'test')",
    default="test",
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--steps",
    "-S",
    help="Number of threshold steps to consider when evaluating the highest "
    "possible F1-score on train/test data.",
    default=1000,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--size",
    "-s",
    help="This is a tuple with two values indicating the size of windows to "
    "be used for sliding window analysis. The values represent height and "
    "width respectively.",
    default=(128, 128),
    nargs=2,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--stride",
    "-T",
    help="This is a tuple with two values indicating the stride of windows "
    "to be used for sliding window analysis. The values represent height and "
    "width respectively.",
    default=(32, 32),
    nargs=2,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--figure",
    "-f",
    help="The name of a performance figure (e.g. f1_score, or jaccard) to "
    "use when comparing performances",
    default="accuracy",
    type=str,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--output-folder",
    "-o",
    help="Path where to store visualizations",
    required=False,
    type=click.Path(),
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--remove-outliers/--no-remove-outliers",
    "-R",
    help="If set, removes outliers from both score distributions before "
    "running statistical analysis. Outlier removal follows a 1.5 IQR range "
    "check from the difference in figures between both systems and assumes "
    "most of the distribution is contained within that range (like in a "
    "normal distribution)",
    default=False,
    required=True,
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--remove-zeros/--no-remove-zeros",
    "-Z",
    help="If set, removes instances from the statistical analysis in which "
    "both systems had a performance equal to zero.",
    default=False,
    required=True,
    show_default=True,
    cls=ResourceOption,
)
@click.option(
    "--parallel",
    "-x",
    help="Set the number of parallel processes to use when running using "
    "multiprocessing. A value of zero uses all reported cores; a value of "
    "one avoids multiprocessing altogether.",
    default=1,
    type=int,
    show_default=True,
    required=True,
    cls=ResourceOption,
)
@click.option(
    "--checkpoint-folder",
    "-k",
    help="Path where to store checkpointed versions of sliding window "
    "performances",
    required=False,
    type=click.Path(),
    show_default=True,
    cls=ResourceOption,
)
@verbosity_option(cls=ResourceOption)
def significance(
    names,
    predictions,
    dataset,
    threshold,
    evaluate,
    steps,
    size,
    stride,
    figure,
    output_folder,
    remove_outliers,
    remove_zeros,
    parallel,
    checkpoint_folder,
    **kwargs,
):
522 """Evaluates how significantly different are two models on the same dataset
524 This application calculates the significance of results of two models
525 operating on the same dataset, and subject to a priori threshold tunning.
526 """

    # minimal validation to startup
    threshold = _validate_threshold(threshold, dataset)
    assert evaluate in dataset, f"No dataset named '{evaluate}'"

    perf1 = _eval_sliding_windows(
        names[0],
        threshold,
        evaluate,
        predictions[0],
        dataset,
        steps,
        size,
        stride,
        (
            output_folder
            if output_folder is None
            else os.path.join(output_folder, names[0])
        ),
        figure,
        parallel,
        checkpoint_folder,
    )

    perf2 = _eval_sliding_windows(
        names[1],
        threshold,
        evaluate,
        predictions[1],
        dataset,
        steps,
        size,
        stride,
        (
            output_folder
            if output_folder is None
            else os.path.join(output_folder, names[1])
        ),
        figure,
        parallel,
        checkpoint_folder,
    )

    # perf_diff = _eval_differences(
    #     names,
    #     (perf1, perf2),
    #     evaluate,
    #     dataset,
    #     size,
    #     stride,
    #     (
    #         output_folder
    #         if output_folder is None
    #         else os.path.join(output_folder, "diff")
    #     ),
    #     figure,
    #     parallel,
    #     checkpoint_folder,
    # )

    # loads all figures for the given threshold
    stems = list(perf1.keys())
    figindex = PERFORMANCE_FIGURES.index(figure)
    da = numpy.array([perf1[k]["winperf"][figindex] for k in stems]).flatten()
    db = numpy.array([perf2[k]["winperf"][figindex] for k in stems]).flatten()
    diff = da - db
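
    # ``da`` and ``db`` are 1D score vectors with one entry per sliding
    # window position, concatenated over all input images; ``diff`` is their
    # paired difference.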

    while remove_outliers:
        outliers_diff = index_of_outliers(diff)
        if sum(outliers_diff) == 0:
            break
        diff = diff[~outliers_diff]
        da = da[~outliers_diff]
        db = db[~outliers_diff]
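
    # ``index_of_outliers`` is assumed (per the --remove-outliers help above)
    # to flag samples outside the 1.5*IQR whisker range.  A minimal numpy
    # sketch of that rule, for reference only:
    #
    #   def iqr_outliers(x):
    #       q1, q3 = numpy.percentile(x, (25, 75))
    #       low = q1 - 1.5 * (q3 - q1)
    #       high = q3 + 1.5 * (q3 - q1)
    #       return (x < low) | (x > high)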

    if remove_zeros:
        # mask out samples in which both systems scored exactly zero
        zero_mask = (da == 0) & (db == 0)
        diff = diff[~zero_mask]
        da = da[~zero_mask]
        db = db[~zero_mask]

    if output_folder is not None:
        fname = os.path.join(output_folder, "analysis.pdf")
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        logger.info(f"Writing analysis figures to {fname} (multipage PDF)...")
        write_analysis_figures(names, da, db, fname)

    if output_folder is not None:
        fname = os.path.join(output_folder, "analysis.txt")
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        logger.info(f"Writing analysis summary to {fname}...")
        with open(fname, "wt") as f:
            write_analysis_text(names, da, db, f)

    write_analysis_text(names, da, db, sys.stdout)
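
# When --output-folder is given, the command above writes (per the code):
#
#   <output-folder>/analysis.pdf    -- multipage PDF with analysis figures
#   <output-folder>/analysis.txt    -- text summary, also echoed to stdout
#   <output-folder>/<system-name>/  -- per-system visualizations, if enabled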