#!/usr/bin/env python
# Andre Anjos <andre.anjos@idiap.ch>
# Thu 23 Jun 20:22:28 2011 CEST
# vim: set fileencoding=utf-8 :
"""
The Iris flower data set or Fisher's Iris data set is a multivariate data
set introduced by Sir Ronald Aylmer Fisher (1936) as an example of
discriminant analysis. It is sometimes called Anderson's Iris data set
because Edgar Anderson collected the data to quantify the geographic
variation of Iris flowers in the Gaspe Peninsula.
For more information: http://en.wikipedia.org/wiki/Iris_flower_data_set
References:
1. Fisher,R.A. "The use of multiple measurements in taxonomic problems",
Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
Mathematical Statistics" (John Wiley, NY, 1950).
2. Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
(Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.
3. Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
Structure and Classification Rule for Recognition in Partially Exposed
Environments". IEEE Transactions on Pattern Analysis and Machine
Intelligence, Vol. PAMI-2, No. 1, 67-71.
4. Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE
Transactions on Information Theory, May 1972, 431-433.
"""
import os
import sys
import numpy
from . import driver #driver interface
import pkg_resources
__version__ = pkg_resources.require(__name__)[0].version
# Feature (column) names, in the order they appear in each row of the data.
names = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']
"""Names of the features for each entry in the dataset."""
# Per-feature statistics over the whole 150-sample dataset; the meaning of
# each list position is given, in order, by ``stat_names`` below.
stats = {
'Sepal Length': [4.3, 7.9, 5.84, 0.83, 0.7826],
'Sepal Width': [2.0, 4.4, 3.05, 0.43, -0.4194],
'Petal Length': [1.0, 6.9, 3.76, 1.76, 0.9490], #high correlation
'Petal Width': [0.1, 2.5, 1.20, 0.76, 0.9565], #high correlation
}
"""These are basic statistics for each of the features in the whole dataset."""
# Labels for the columns of each ``stats`` entry.
stat_names = ['Minimum', 'Maximum', 'Mean', 'Std.Dev.', 'Correlation']
"""These are the statistics available in each column of the stats variable."""
def data():
    """Loads from (text) file and returns Fisher's Iris Dataset.

    This set is small and simple enough to not require an SQL backend.  We
    keep the single file it has in text and load it on-the-fly every time
    this method is called.

    Returns:

      dict: Maps each of the 3 classes of Iris plants catalogued in this
      dataset (e.g. ``'setosa'``) to a 2D :py:class:`numpy.ndarray` of
      64-bit floats with 50 entries.  Each entry is an array with the 4
      features described by ``names``.
    """
    import csv
    import pkg_resources

    data = pkg_resources.resource_filename(__name__, 'iris.data')
    retval = {}

    # The CSV file reader API changed between Python2 and Python3
    open_dict = dict(mode='rb')  # python2.x
    if sys.version_info[0] >= 3:  # python3.x
        open_dict = dict(mode='rt', encoding='ascii', newline='')

    with open(data, **open_dict) as csvfile:
        for row in csv.reader(csvfile):
            # The 5th column reads like "Iris-setosa"; drop the "Iris-"
            # prefix (5 characters) to keep only the class name.
            name = row[4][5:].lower()
            retval.setdefault(name, []).append([float(k) for k in row[:4]])

    # Convert the accumulated per-class lists to float64 2D numpy.ndarray
    for key, value in retval.items():
        retval[key] = numpy.array(value, dtype='float64')

    return retval
def __dump__(args):
    """Dumps the database to stdout.

    Keyword arguments:

    args
      A argparse.Arguments object with options set. We use two of the options:
      ``cls`` for the class to be dumped (if None, then dump all data) and
      ``selftest``, which runs the internal test.
    """

    dataset = data()

    # Restrict the dump to a single class when one was requested
    if args.cls:
        dataset = {args.cls: dataset[args.cls]}

    output = sys.stdout
    if args.selftest:
        # Self-test mode: discard everything that would be printed
        from bob.db.base.utils import null
        output = null()

    for cls_name, samples in dataset.items():
        for sample in samples:
            fields = ['%.1f' % sample[j] for j in range(sample.shape[0])]
            fields.append(cls_name)
            output.write('%s\n' % (','.join(fields),))

    return 0
def get_config():
    """Returns a string containing the configuration information.

    Delegates to :py:func:`bob.extension.get_config` for this package's
    name, which reports the installed versions of this package and its
    dependencies.
    """
    import bob.extension
    return bob.extension.get_config(__name__)
# Public API of this module: the dataset accessor plus its metadata tables.
__all__ = ['names', 'stats', 'stat_names', 'data', 'get_config']