"""Tables representing values with asymmetric uncertainties.
This module provides a class to handle CSV-like table data representing values
with asymmetric uncertainties. Such tables are provided in various format; for
example, the uncertainty may be relative or absolute, or with multiple sources.
The class :class:`BaseFile` interprets such tables based on `FileInfo`
annotations.
"""
from __future__ import absolute_import, division, print_function # py2
import itertools
import json
import logging
import pathlib # noqa: F401
import sys
from typing import ( # noqa: F401
Any,
Generic,
List,
Mapping,
MutableMapping,
Optional,
Sequence,
Set,
TypeVar,
Union,
cast,
)
import pandas
import numpy
from susy_cross_section.base.info import FileInfo, UncSpecType, ValueInfo
from susy_cross_section.utility import Unit
if sys.version_info[0] < 3:  # py2
    # Python 2 compatibility: ``basestring`` covers both text types there, and
    # the ``json`` module has no dedicated decode error, so alias both names
    # to let the rest of the module be written version-agnostically.
    str = basestring  # noqa: A001, F821
    JSONDecodeError = Exception
else:
    JSONDecodeError = json.decoder.JSONDecodeError

# NOTE(review): this configures the root logger as an import side effect;
# kept unchanged because existing users may depend on it.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

# Values accepted wherever this module takes a file-system path.
PathLike = Union[pathlib.Path, str]

# Type of the table objects stored in a file object (bounded by BaseTable).
TableT = TypeVar("TableT", bound="BaseTable", covariant=True)
class BaseTable(object):
    """Table object with annotations.

    This is a wrapper class of :class:`pandas.DataFrame`. Any methods except
    for read/write of `!file` are delegated to the DataFrame object.

    Arguments
    ---------
    obj: pandas.DataFrame, optional
        The data-frame to wrap (it is not copied).
        Any other value, including `None`, results in an empty data-frame.
    file: BaseFile, optional
        File object containing this table.
    name: str, optional
        Name of this table.

    Attributes
    ----------
    file: BaseFile, optional
        File object containing this table.
    name: str, optional
        Name of this table.

        This is provided so that `ValueInfo` can be obtained from `!file`.
    """

    def __init__(self, obj=None, file=None, name=None):
        # type:(pandas.DataFrame, Optional[BaseFile[BaseTable]], Optional[str])->None
        if isinstance(obj, pandas.DataFrame):
            self._df = obj  # type: pandas.DataFrame
        else:
            self._df = pandas.DataFrame()
        self.file = file  # type: Optional[BaseFile[BaseTable]]
        self.name = name  # type: Optional[str]

    def __getattr__(self, name):
        # type: (str)->Any
        """Fall-back method to delegate any operations to the DataFrame.

        This is invoked only for attributes not found on the table itself.

        Raises
        ------
        AttributeError
            If neither this object nor the wrapped data-frame has `!name`.
        """
        if name == "_df":
            # The wrapped frame itself is missing (the instance was created
            # without running ``__init__``). Evaluating ``self._df`` below
            # would re-enter this method without end, so fail explicitly.
            raise AttributeError(name)
        return getattr(self._df, name)

    def __setitem__(self, name, obj):
        # type: (str, Any)->Any
        """Perform DataFrame.__setitem__."""
        return self._df.__setitem__(name, obj)

    def __getitem__(self, name):
        # type: (str)->Any
        """Perform DataFrame.__getitem__."""
        return self._df.__getitem__(name)

    def __str__(self):
        # type: ()->str
        """Dump the data-frame."""
        return cast(str, self._df.__str__())

    def to_records(self):
        # type: ()->numpy.recarray
        """Export the data-frame as a record array.

        Returns
        -------
        numpy.recarray
            The result of :meth:`pandas.DataFrame.to_records`.
        """
        return self._df.to_records()  # type: ignore
class BaseFile(Generic[TableT]):
    """File with table data-sets and annotations.

    An instance has two main attributes: `!info` (:typ:`FileInfo`) as the
    annotation and `!tables` (:typ:`dict` of :typ:`BaseTable`) as the data
    tables.

    Arguments
    ---------
    table_path: str or pathlib.Path
        Path to the csv data file.

        If a `BaseFile` object is given instead, the new object shares the
        paths, annotation, raw data, and tables of that object (nothing is
        copied or re-read); `!info_path` must then be left unspecified.
    info_path: str or pathlib.Path, optional
        Path to the corresponding info file.

        If unspecified, `!table_path` with suffix changed to ``".info"`` is
        used.

    Attributes
    ----------
    table_path: pathlib.Path
        Path to the csv data file.
    info_path: pathlib.Path
        Path to the info file.
    raw_data: pandas.DataFrame
        the content of `!table_path`.
    info: FileInfo
        the content of `!info_path`.
    tables: dict(str, BaseTable)
        The table parsed according to the annotation.

        Each value is practically a `pandas.DataFrame` object and indexed
        according to the parameter specified in `!info`, having exactly three
        value-columns: ``"value"``, ``"unc+"``, and ``"unc-"`` for the central
        value and positive- and negative- directed **absolute** uncertainty,
        respectively. As computed by this class, both uncertainty columns hold
        non-negative magnitudes (quadrature sums of the sources).

    Raises
    ------
    ValueError
        If a parsed table has duplicated index entries.
    """

    # NOTE(review): the docstring used to state that ``"unc-"`` is
    # non-positive, which the computation in ``_parse_data`` does not
    # produce; confirm which convention the callers expect.

    def __init__(self, table_path, info_path=None):
        # type: (Union[PathLike, BaseFile[TableT]], Optional[PathLike])->None
        if isinstance(table_path, BaseFile):
            # copy constructor
            assert info_path is None  # or invalid use of copy constructor
            self.table_path = table_path.table_path  # type: pathlib.Path
            self.info_path = table_path.info_path  # type: pathlib.Path
            self.info = table_path.info  # type: FileInfo
            self.raw_data = table_path.raw_data  # type: pandas.DataFrame
            self.tables = table_path.tables  # type: MutableMapping[str, TableT]
            return

        self.table_path = pathlib.Path(table_path)
        self.info_path = pathlib.Path(
            info_path if info_path else self.table_path.with_suffix(".info")
        )

        self.info = FileInfo.load(self.info_path)
        self.raw_data = self._read_csv(self.table_path)

        # validate annotation before actual load
        self.info.validate()
        # and do actual loading
        self.tables = self._parse_data()
        self.validate()

    def _read_csv(self, path):
        # type: (pathlib.Path)->pandas.DataFrame
        """Read a csv file and return the content.

        Internally, call `pandas.read_csv` with `!reader_options`.
        By default the first line is skipped and the columns are named after
        the columns in `!info`; options given in `!info` take precedence.
        """
        reader_options = {
            "skiprows": [0],
            "names": [c.name for c in self.info.columns],
        }  # default values
        reader_options.update(self.info.reader_options)
        return pandas.read_csv(path, **reader_options)

    def _parse_data(self):
        # type: ()->MutableMapping[str, TableT]
        """Load and prepare data from the specified paths.

        Returns
        -------
        dict(str, BaseTable)
            One table for each value in `!info`, keyed by the column name and
            having the columns ``"value"``, ``"unc+"``, and ``"unc-"``.
        """
        tables = {}  # type: MutableMapping[str, TableT]

        def calc(row, unc_sources, sign):
            # type: (pandas.Series, List[UncSpecType], int)->float
            """Calculate uncertainty from a row in normalized dataframe."""
            unc_components = []  # type: List[float]
            for source, unc_type in unc_sources:  # iterate over sources
                if "signed" in unc_type.split(","):
                    # use only the correct-signed uncertainties
                    unc_candidates = [abs(row[c]) for c in source if row[c] * sign > 0]
                else:
                    unc_candidates = [abs(row[c]) for c in source]
                # each source contributes its largest candidate, or nothing
                unc_components.append(max(unc_candidates) if unc_candidates else 0)
            # sources are combined in quadrature
            return sum(i ** 2 for i in unc_components) ** 0.5  # type: ignore

        for value_info in self.info.values:
            name = value_info.column
            data = self._prepare_normalized_data(value_info)

            table = cast(TableT, BaseTable(file=self, name=name))
            table["value"] = data[name]

            # Evaluate row by row, but assign each column in one go: writing
            # cell by cell is slow and would leave the columns missing
            # altogether when the data has no rows.
            unc_p = []  # type: List[float]
            unc_m = []  # type: List[float]
            for _, row in data.iterrows():
                unc_p.append(calc(row, value_info.unc_p, +1))
                unc_m.append(calc(row, value_info.unc_m, -1))
            table["unc+"] = unc_p
            table["unc-"] = unc_m

            tables[name] = table
        return tables

    def _prepare_normalized_data(self, value_info):
        # type: (ValueInfo)->pandas.DataFrame
        """Quantize parameters and normalize columns to value_info.column.

        The returned data-frame is a modified copy of `!raw_data`, indexed by
        the parameter columns, and contains only the value column and the
        uncertainty columns referred to by `!value_info`. Uncertainties are
        converted to absolute ones in the unit of the value column.
        """
        data = self.raw_data.copy()

        def quantize(data_frame, granularity):
            # type: (pandas.DataFrame, float)->pandas.DataFrame
            return (data_frame / granularity).apply(round) * granularity

        # set index by the quantized values
        for p in self.info.parameters:
            if p.granularity:
                data[p.column] = quantize(data[p.column], p.granularity)
        data.set_index([p.column for p in self.info.parameters], inplace=True)

        # collect columns to use
        abs_columns = set()  # type: Set[str]
        rel_columns = set()  # type: Set[str]
        for unc_cols, unc_type in itertools.chain(value_info.unc_p, value_info.unc_m):
            is_relative = "relative" in unc_type.split(",")
            for c in unc_cols:
                (rel_columns if is_relative else abs_columns).add(c)
        assert abs_columns.isdisjoint(rel_columns)

        name = value_info.column
        value_unit = Unit(self.info.get_column(name).unit)
        unused_columns = []  # type: List[str]
        for col in data.columns:
            if col == name:
                pass
            elif col in abs_columns:
                # unc / unc_unit == "number in the table"
                # we want to get "unc / value_unit"
                #   = "number in the table" * unc_unit / value_unit
                unc_unit = Unit(self.info.get_column(col).unit)
                data[col] = data[col] * float(unc_unit / value_unit)
            elif col in rel_columns:
                # relative uncertainties are first scaled by the central value
                unc_unit = Unit(self.info.get_column(col).unit) * value_unit
                data[col] = data[name] * data[col] * float(unc_unit / value_unit)
            else:
                unused_columns.append(col)
        # drop after the loop so that the columns are not modified during
        # the iteration over them
        if unused_columns:
            data.drop(unused_columns, axis=1, inplace=True)
        return data

    def validate(self):
        # type: ()->None
        """Validate the Table data.

        Raises
        ------
        ValueError
            If any table has duplicated index entries. The message reports
            the table name and the first duplicated entry.
        """
        for key, table in self.tables.items():
            duplication = table.index[table.index.duplicated()]
            if len(duplication) == 0:
                continue
            message = "Found duplicated entries: {}, {}".format(key, duplication[0])
            if len(duplication) > 5:
                # many collisions usually hint at too coarse a quantization
                message += " Maybe parameter granularity is set too large?"
            raise ValueError(message)

    # ------------------ #
    # accessor functions #
    # ------------------ #

    def __getitem__(self, key):
        # type: (str)->BaseTable
        """Return the specified table data.

        Arguments
        ---------
        key: str
            The key of the data to return.

        Returns
        -------
        BaseTable
            One of the data tables specified by :ar:`key`.
        """
        return self.tables[key]

    def dump(self, keys=None):
        # type: (Optional[List[str]])->str
        """Return the dumped string of the data tables.

        Arguments
        ---------
        keys: list of str, optional
            if specified, specified data are only dumped.

        Returns
        -------
        str
            Dumped data.
        """
        results = []  # type: List[str]
        line = "-" * 72
        keys_to_show = self.tables.keys() if keys is None else keys
        for k in keys_to_show:
            table = self.tables[k]
            # Table objects without their own ``unit`` attribute fall back to
            # the unit annotated for the value column, which is the unit the
            # table content is normalized to.
            unit = getattr(table, "unit", None)
            if unit is None:
                unit = self.info.get_column(k).unit
            results.append(line)
            results.append('TABLE "{}" (unit: {})'.format(k, unit))
            results.append(line)
            results.append(table.__str__())  # py2
            results.append("")
        results.append(line)
        for k, v in self.info.document.items():
            results.append(u"{}: {}".format(k, v))
        results.append(line)
        return "\n".join(results)