1#!/usr/bin/env python
2# coding=utf-8
3
4"""NIH CXR14 (relabeled) dataset for computer-aided diagnosis
5
6This dataset was extracted from the clinical PACS database at the National
7Institutes of Health Clinical Center (USA) and represents 60% of all
8their radiographs. It contains labels for fourteen common radiological
9signs in this order: cardiomegaly, emphysema, effusion, hernia, infiltration,
10mass, nodule, atelectasis, pneumothorax, pleural thickening, pneumonia,
11fibrosis, edema and consolidation.
12This is the relabeled version created in the CheXNeXt study.
13
14* Reference: [NIH-CXR14-2017]_
15* Original resolution (height x width or width x height): 1024 x 1024
16* Labels: [CHEXNEXT-2018]_
17* Split reference: [CHEXNEXT-2018]_
18* Protocol ``default``:
19
20 * Training samples: 98'637 (including labels)
21 * Validation samples: 6'350 (including labels)
22 * Test samples: 0
23
* Protocol ``idiap``:

  * Images path adapted to Idiap infrastructure
26
27"""
28
29import os
30import pkg_resources
31
32import bob.extension
33
34from ..dataset import JSONDataset
35from ..loader import load_pil_rgb, make_delayed
36
# Filesystem paths of the JSON protocol definitions, each one resolved through
# ``pkg_resources`` relative to this module
_protocols = [
    pkg_resources.resource_filename(__name__, _filename)
    for _filename in (
        "default.json",
        "idiap.json",
        "cardiomegaly_idiap.json",
    )
]
42
43
def _raw_data_loader(sample):
    """Loads the image and the label referenced by a single sample entry

    The image path is built by joining ``sample["data"]`` to the directory
    stored under the ``bob.med.tb.nih_cxr14_re.datadir`` key of the
    ``bob.extension`` rc settings.  When that key is not set, the current
    directory (resolved with ``os.path.realpath``) is used instead.

    ``sample`` must provide the ``"data"`` and ``"label"`` keys.  The returned
    dictionary carries the same two keys: ``"data"`` holds whatever
    ``load_pil_rgb`` returns for the image path, and ``"label"`` is passed
    through unchanged.
    """
    # the fallback directory is computed eagerly, even if the rc key is set
    root = bob.extension.rc.get(
        "bob.med.tb.nih_cxr14_re.datadir", os.path.realpath(os.curdir)
    )
    image_path = os.path.join(root, sample["data"])

    # the image is loaded before the label is looked up
    image = load_pil_rgb(image_path)
    return {"data": image, "label": sample["label"]}
56
57
def _loader(context, sample):
    """Wraps ``sample`` so that its data is only loaded when needed

    ``context`` is accepted but not used.  The return value is whatever
    ``make_delayed`` builds from ``sample`` and ``_raw_data_loader``.
    """
    # "context" is ignored in this case - database is homogeneous
    # we return delayed samples to avoid loading all images at once
    delayed_sample = make_delayed(sample, _raw_data_loader)
    return delayed_sample
62
63
# Samples expose the "data" and "label" fields and are built through _loader
dataset = JSONDataset(
    protocols=_protocols,
    fieldnames=("data", "label"),
    loader=_loader,
)
"""NIH CXR14 (relabeled) dataset object"""