Source code for susy_cross_section.base.table

"""Tables representing values with asymmetric uncertainties.

This module provides a class to handle CSV-like table data representing values
with asymmetric uncertainties. Such tables are provided in various format; for
example, the uncertainty may be relative or absolute, or with multiple sources.
The class :class:`BaseFile` interprets such tables based on `FileInfo`
annotations.
"""

from __future__ import absolute_import, division, print_function  # py2

import itertools
import json
import logging
import pathlib  # noqa: F401
import sys
from typing import (  # noqa: F401
    Any,
    Generic,
    List,
    Mapping,
    MutableMapping,
    Optional,
    Sequence,
    Set,
    TypeVar,
    Union,
    cast,
)

import numpy
import pandas

from susy_cross_section.base.info import FileInfo, UncSpecType, ValueInfo
from susy_cross_section.utility import Unit

if sys.version_info[0] < 3:  # py2
    str = basestring  # noqa: A001, F821
    JSONDecodeError = Exception
else:
    JSONDecodeError = json.decoder.JSONDecodeError


logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

PathLike = Union[pathlib.Path, str]
TableT = TypeVar("TableT", bound="BaseTable", covariant=True)


class BaseTable(object):
    """Table object with annotations.

    This is a wrapper class of :class:`pandas.DataFrame`. Any methods except
    for read/write of `!file` are delegated to the DataFrame object.

    Attributes
    ----------
    file: BaseFile, optional
        File object containing this table.

    name: str, optional
        Name of this table. This is provided so that `ValueInfo` can be
        obtained from `!file`.
    """

    def __init__(self, obj=None, file=None, name=None):
        # type:(pandas.DataFrame, Optional[BaseFile[BaseTable]], Optional[str])->None
        if isinstance(obj, pandas.DataFrame):
            self._df = obj  # type: pandas.DataFrame
        else:
            self._df = pandas.DataFrame()
        self.file = file  # type: Optional[BaseFile[BaseTable]]
        self.name = name  # type: Optional[str]
    def __getattr__(self, name):
        # type: (str)->Any
        """Fall-back method to delegate any operations to the DataFrame."""
        return self._df.__getattr__(name)

    def __setitem__(self, name, obj):
        # type: (str, Any)->Any
        """Perform DataFrame.__setitem__."""
        return self._df.__setitem__(name, obj)

    def __getitem__(self, name):
        # type: (str)->Any
        """Perform DataFrame.__getitem__."""
        return self._df.__getitem__(name)

    def __str__(self):
        # type: ()->str
        """Dump the data-frame."""
        return cast(str, self._df.__str__())

    def header(self):
        # type: ()->List[Any]
        """Return the header of the DataFrame regarded as a table."""
        return list(self._df.index.names) + list(self._df.columns)

    def to_records(self):
        # type: ()->numpy.recarray
        """Export the data-frame as a numpy record array."""
        return self._df.to_records()  # type: ignore
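# A sketch of the delegation behavior (the values here are illustrative only):
#
#     t = BaseTable(pandas.DataFrame({"value": [1.0, 2.0]}))
#     t["unc+"] = [0.1, 0.2]   # __setitem__ is forwarded to the DataFrame
#     print(t.columns)         # unknown attributes fall back via __getattr__
#     print(t.header())        # index names followed by column names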
class BaseFile(Generic[TableT]):
    """File with table data-sets and annotations.

    An instance has two main attributes: `!info` (:typ:`FileInfo`) as the
    annotation and `!tables` (:typ:`dict` of :typ:`BaseTable`) as the data
    tables.

    Arguments
    ---------
    table_path: str or pathlib.Path
        Path to the csv data file.

    info_path: str or pathlib.Path, optional
        Path to the corresponding info file. If unspecified, `!table_path`
        with suffix changed to ``".info"`` is used.

    Attributes
    ----------
    table_path: pathlib.Path
        Path to the csv data file.

    info_path: pathlib.Path
        Path to the info file.

    raw_data: pandas.DataFrame
        The content of `!table_path`.

    info: FileInfo
        The content of `!info_path`.

    tables: dict(str, BaseTable)
        The tables parsed according to the annotation.

        Each value is practically a `pandas.DataFrame` object, indexed
        according to the parameters specified in `!info` and having exactly
        three value-columns: ``"value"``, ``"unc+"``, and ``"unc-"`` for the
        central value and the positive- and negative-directed **absolute**
        uncertainties, respectively. The content of ``"unc-"`` is
        non-positive.
    """

    def __init__(self, table_path, info_path=None):
        # type: (Union[PathLike, BaseFile[TableT]], Optional[PathLike])->None
        if isinstance(table_path, BaseFile):  # copy constructor
            assert info_path is None  # or invalid use of copy constructor
            self.table_path = table_path.table_path  # type: pathlib.Path
            self.info_path = table_path.info_path  # type: pathlib.Path
            self.info = table_path.info  # type: FileInfo
            self.raw_data = table_path.raw_data  # type: pandas.DataFrame
            self.tables = table_path.tables  # type: MutableMapping[str, TableT]
            return

        self.table_path = pathlib.Path(table_path)
        self.info_path = pathlib.Path(
            info_path if info_path else self.table_path.with_suffix(".info")
        )
        self.info = FileInfo.load(self.info_path)
        self.raw_data = self._read_csv(self.table_path)
        # validate annotation before actual load
        self.info.validate()
        # and do actual loading
        self.tables = self._parse_data()
        self.validate()

    def _read_csv(self, path):
        # type: (pathlib.Path)->pandas.DataFrame
        """Read a csv file and return the content.

        Internally, `pandas.read_csv` is called with `!reader_options`.
        """
        reader_options = {
            "skiprows": [0],
            "names": [c.name for c in self.info.columns],
        }  # default values
        reader_options.update(self.info.reader_options)
        return pandas.read_csv(path, **reader_options)

    def _parse_data(self):
        # type: ()->MutableMapping[str, TableT]
        """Load and prepare data from the specified paths."""
        tables = {}  # type: MutableMapping[str, TableT]

        def calc(row, unc_sources, sign):
            # type: (pandas.Series, List[UncSpecType], int)->float
            """Calculate uncertainty from a row in the normalized dataframe."""
            unc_components = []  # type: List[float]
            for source, unc_type in unc_sources:  # iterate over sources
                if "signed" in unc_type.split(","):
                    # use only the correctly-signed uncertainties
                    unc_candidates = [abs(row[c]) for c in source if row[c] * sign > 0]
                else:
                    unc_candidates = [abs(row[c]) for c in source]
                unc_components.append(max(unc_candidates) if unc_candidates else 0)
            return sum(i ** 2 for i in unc_components) ** 0.5  # type: ignore

        for value_info in self.info.values:
            name = value_info.column
            data = self._prepare_normalized_data(value_info)
            tables[name] = cast(TableT, BaseTable(file=self, name=name))
            tables[name]["value"] = data[name]
            for key, row in data.iterrows():
                tables[name].loc[key, "unc+"] = calc(row, value_info.unc_p, +1)
                tables[name].loc[key, "unc-"] = calc(row, value_info.unc_m, -1)
        return tables

    def _prepare_normalized_data(self, value_info):
        # type: (ValueInfo)->pandas.DataFrame
        """Quantize parameters and normalize columns to value_info.column."""
        data = self.raw_data.copy()

        def quantize(data_frame, granularity):
            # type: (pandas.DataFrame, float)->pandas.DataFrame
            return (data_frame / granularity).apply(round) * granularity

        # set index by the quantized values
        for p in self.info.parameters:
            if p.granularity:
                data[p.column] = quantize(data[p.column], p.granularity)
        data.set_index([p.column for p in self.info.parameters], inplace=True)

        # collect columns to use
        abs_columns = set()  # type: Set[str]
        rel_columns = set()  # type: Set[str]
        for unc_cols, unc_type in itertools.chain(value_info.unc_p, value_info.unc_m):
            is_relative = "relative" in unc_type.split(",")
            for c in unc_cols:
                (rel_columns if is_relative else abs_columns).add(c)
        assert abs_columns.isdisjoint(rel_columns)

        name = value_info.column
        value_unit = Unit(self.info.get_column(name).unit)
        for col in data.columns:
            if col == value_info.column:
                pass
            elif col in abs_columns:
                # unc / unc_unit == "number in the table";
                # we want "unc / value_unit"
                #   = "number in the table" * unc_unit / value_unit
                unc_unit = Unit(self.info.get_column(col).unit)
                data[col] = data[col] * float(unc_unit / value_unit)
            elif col in rel_columns:
                unc_unit = Unit(self.info.get_column(col).unit) * value_unit
                data[col] = data[name] * data[col] * float(unc_unit / value_unit)
            else:
                data.drop(col, axis=1, inplace=True)
        return data
    def validate(self):
        # type: ()->None
        """Validate the table data."""
        for key, table in self.tables.items():
            duplication = table.index[table.index.duplicated()]
            if len(duplication) > 5:
                raise ValueError(
                    "Found %d duplicated entries in %s; "
                    "maybe parameter granularity is set too large?"
                    % (len(duplication), key)
                )
            for d in duplication:
                raise ValueError("Found duplicated entries: %s, %s" % (key, d))
    # ------------------ #
    # accessor functions #
    # ------------------ #
    def __getitem__(self, key):
        # type: (str)->BaseTable
        """Return the specified table data.

        Arguments
        ---------
        key: str
            The key of the table to return.

        Returns
        -------
        BaseTable
            One of the data tables, specified by `!key`.
        """
        return self.tables[key]
    def dump(self, keys=None):
        # type: (Optional[List[str]])->str
        """Return the dumped string of the data tables.

        Arguments
        ---------
        keys: list of str, optional
            If specified, only the specified tables are dumped.

        Returns
        -------
        str
            Dumped data.
        """
        results = []  # type: List[str]
        line = "-" * 72
        keys_to_show = self.tables.keys() if keys is None else keys
        for k in keys_to_show:
            results.append(line)
            results.append('TABLE "{}" (unit: {})'.format(k, self.tables[k].unit))
            results.append(line)
            results.append(self.tables[k].__str__())  # py2
            results.append("")
        results.append(line)
        for k, v in self.info.document.items():
            results.append(u"{}: {}".format(k, v))
        results.append(line)
        return "\n".join(results)
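# Usage sketch for dump() (the file name and table key are hypothetical):
#
#     grid = BaseFile("grid.csv")   # reads grid.csv and grid.info
#     print(grid.dump(["xsec"]))    # table body, then the document block
#                                   # of the info file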