"""Classes to describe annotations of general-purpose tables.
This module provides annotation classes for CSV-like table data. The data is a
two-dimensional table and represents functions over a parameter space. Some
columns represent parameters and the others represent values. Each row
represents a single data point and its corresponding values.
Two structural annotations and two semantic annotations are defined. `FileInfo`
and `ColumnInfo` are structural, which respectively annotate the whole file and
each column. For semantics, `ParameterInfo` collects the information of
parameters, each of which is a column, and `ValueInfo` is for a value. A value
may be given by multiple columns if, for example, the value has uncertainties
or the value is given by the average of two columns.
"""
from __future__ import absolute_import, division, print_function # py2
import itertools
import json
import logging
import pathlib # noqa: F401
from typing import Any, Dict, List, Mapping, MutableMapping, Optional, Tuple, Union
from susy_cross_section.utility import TypeCheck as TC
# Short alias for the exception class exposed as `json.decoder.JSONDecodeError`.
JSONDecodeError = json.decoder.JSONDecodeError
# NOTE(review): this configures the root logger as a side effect of importing
# the module (threshold WARNING) -- confirm that a library module should do so.
logging.basicConfig(level=logging.WARNING)
# Module-level logger shared by all classes defined in this file.
logger = logging.getLogger(__name__)
# One uncertainty source: (names of the source columns, uncertainty type).
UncSpecType = Tuple[List[str], str]
class ColumnInfo(object):
    """Stores information of a column.

    Instead of the :typ:`int` identifier `!index`, we use `!name` as the
    principal identifier for readability. We also annotate a column by `!unit`,
    which is :typ:`str` that is passed to `Unit()`.

    Attributes
    ----------
    index : int
        The zero-based index of column.

        The columns of a file should have valid `!index`, i.e., no overlap, no
        gap, and starting from zero.
    name : str
        The human-readable and machine-readable name of the column.

        As it is used as the identifier, it should be unique in one file.
    unit : str
        The unit of column, or empty string if the column has no unit.

        The default value is an empty str ``''``, which means the column has no
        unit. Internally this is passed to `Unit()`.

    Note
    ----
    As for now, `!unit` is restricted as a str object, but in future a float
    should be allowed to describe "x1000" etc.
    """

    def __init__(self, index, name, unit=""):
        # type: (int, str, str)->None
        self.index = index  # type: int
        self.name = name  # type: str
        # A falsy unit (e.g. ``None`` coming from JSON) means "no unit".
        self.unit = unit or ""  # type: str

    @classmethod
    def from_json(cls, json_obj):
        # type: (Any)->ColumnInfo
        """Initialize an instance from valid json data.

        Keys other than ``"index"``, ``"name"`` and ``"unit"`` are ignored
        with a warning. The constructed object is validated before returned.

        Parameters
        ----------
        json_obj: Any
            a valid json object.

        Returns
        -------
        ColumnInfo
            Constructed instance.

        Raises
        ------
        ValueError
            If :ar:`json_obj` is not dict-like, lacks a mandatory key, or has
            invalid content.
        TypeError
            If an attribute has an invalid type (raised by `validate`).
        """
        try:
            obj = cls(
                index=json_obj["index"],
                name=json_obj["name"],
                unit=json_obj.get("unit", ""),
            )
        except (TypeError, AttributeError) as e:
            logger.critical("ColumnInfo.from_json caught an exception.", exc_info=e)
            raise ValueError("Invalid data passed to ColumnInfo.from_json: %s" % (e,))
        except KeyError as e:
            logger.critical("ColumnInfo.from_json caught an exception.", exc_info=e)
            raise ValueError("ColumnInfo data missing: %s" % (e,))

        for k in json_obj:
            if k not in ("index", "name", "unit"):
                logger.warning("Unknown data for ColumnInfo.from_json: %s", k)

        obj.validate()
        return obj

    def to_json(self):
        # type: ()->MutableMapping[str, Union[str, int]]
        """Serialize the object to a json data.

        The key ``"unit"`` is emitted only if the column has a unit.

        Returns
        -------
        dict(str, str or int)
            The json data describing the object.
        """
        json_obj = {
            "index": self.index,
            "name": self.name,
        }  # type: MutableMapping[str, Union[str, int]]
        if self.unit:
            json_obj["unit"] = self.unit
        return json_obj

    def validate(self):
        # type: ()->None
        """Validate the content.

        Raises
        ------
        TypeError
            If any attributes are invalid type of instance.
        ValueError
            If any attributes have invalid content.
        """
        # The messages are formatted eagerly: exception constructors, unlike
        # logging calls, do not interpolate "%s" placeholders by themselves.
        if not isinstance(self.index, int):
            raise TypeError("ColumnInfo.index must be int: %s" % (self.index,))
        if self.index < 0:
            raise ValueError(
                "ColumnInfo.index must be non-negative: %s" % (self.index,)
            )
        if not isinstance(self.name, str):
            raise TypeError(
                "Col %d: `name` must be string: %s" % (self.index, self.name)
            )
        if not self.name:
            raise ValueError("Column %d: `name` missing" % (self.index,))
        if not isinstance(self.unit, str):
            raise TypeError(
                "Col %d: `unit` must be string: %s" % (self.index, self.unit)
            )
class ParameterInfo(object):
    """Stores information of a parameter.

    A parameter set defines a data point for the functions described by the
    file. A parameter set has one or more parameters, each of which
    corresponds to a column of the file. The `!column` attribute has
    :attr:`ColumnInfo.name` of the column.

    Since the parameter value is read from an ASCII file, :typ:`float` values
    might have round-off errors, which might cause grid misalignments in grid-
    based interpolations. To have the same :typ:`float` expression on the
    numbers that should be on the same grid, `!granularity` should be provided.

    Attributes
    ----------
    column: str
        Name of the column that stores this parameter.
    granularity: int or float, optional
        Assumed precision of the parameter.

        This is used to round the parameter so that a data point should be
        exactly on the grid. Internally, a parameter is rounded to::

            round(value / granularity) * granularity

        For example, for a grid ``[10, 20, 30, 50, 70]``, it should be set to
        10 (or 5, 1, 0.1, etc.), while for ``[33.3, 50, 90]``, it should be
        0.01.
    """

    def __init__(self, column="", granularity=None):
        # type: (str, Optional[float])->None
        self.column = column  # type: str
        # Any falsy granularity (``None`` or zero) is stored as ``None``.
        self.granularity = granularity or None  # type: Optional[float]

    @classmethod
    def from_json(cls, json_obj):
        # type: (Any)->ParameterInfo
        """Initialize an instance from valid json data.

        Keys other than ``"column"`` and ``"granularity"`` are ignored with a
        warning. The constructed object is validated before returned.

        Parameters
        ----------
        json_obj: Any
            a valid json object.

        Returns
        -------
        ParameterInfo
            Constructed instance.

        Raises
        ------
        ValueError
            If :ar:`json_obj` is not dict-like or lacks the key ``"column"``.
        AssertionError
            If the constructed object does not pass `validate`.
        """
        try:
            obj = cls(
                column=json_obj["column"], granularity=json_obj.get("granularity")
            )
        except (TypeError, AttributeError) as e:
            logger.critical("ParameterInfo.from_json caught an exception.", exc_info=e)
            raise ValueError(
                "Invalid data passed to ParameterInfo.from_json: %s" % (e,)
            )
        except KeyError as e:
            logger.critical("ParameterInfo.from_json caught an exception.", exc_info=e)
            raise ValueError("ParameterInfo data missing: %s" % (e,))

        for k in json_obj:
            if k not in ("column", "granularity"):
                logger.warning("Unknown data for ParameterInfo.from_json: %s", k)

        obj.validate()
        return obj

    def to_json(self):
        # type: ()->MutableMapping[str, Union[str, float]]
        """Serialize the object to a json data.

        The key ``"granularity"`` is emitted only if a granularity is set.

        Returns
        -------
        dict(str, str or float)
            The json data describing the object.
        """
        json_obj = {"column": self.column}  # type: Dict[str, Union[str, float]]
        if self.granularity:
            json_obj["granularity"] = self.granularity
        return json_obj

    def validate(self):
        # type: ()->None
        """Validate the content.

        The checks are implemented as ``assert`` statements; they are
        therefore skipped if Python runs with optimization (``-O``).

        Raises
        ------
        AssertionError
            If `!column` is not a non-empty string, or if `!granularity` is
            given but is not a positive number.
        """
        assert isinstance(self.column, str), "ParameterInfo.column must be string."
        assert self.column, "ParameterInfo.column is missing"
        if self.granularity is not None:
            assert hasattr(self.granularity, "__float__"), "Granularity not a number."
            assert float(self.granularity) > 0, "Negative granularity."
class ValueInfo(object):
    """Stores information of value accompanied by uncertainties.

    A value is generally composed from several columns. In current
    implementation, the central value must be given by one column, whose name
    is specified by :attr:`column`. The positive- and negative-direction
    uncertainties are specified by `!unc_p` and `!unc_m`, respectively, which
    are lists of ``(column names, type)`` pairs.

    Attributes
    ----------
    column: str
        Name of the column that stores this value.

        It must match one of the :attr:`ColumnInfo.name` in the file. `validate`
        accepts only a non-empty string.
    attributes: dict (str, Any)
        Physical information annotated to this value.
    unc_p : list of (list of str, str)
        The sources of "plus" uncertainties.

        Multiple uncertainty sources can be specified. Each source is a pair:
        the first element is the list of :attr:`ColumnInfo.name` of the source
        columns, and the second denotes the "type" of the source. The accepted
        types are:

        - ``"relative"`` for relative uncertainty, where the unit of the column
          must be dimension-less.
        - ``"absolute"`` for absolute uncertainty, where the unit of the column
          must be the same as that of the value column up to a factor.
        - ``"absolute,signed"`` or ``"relative,signed"`` for absolute/relative
          uncertainty but using the columns with correct sign. The reversed
          spellings ``"signed,absolute"`` and ``"signed,relative"`` are also
          accepted.
    unc_m : list of (list of str, str)
        The sources of "minus" uncertainties.

        Details are the same as `!unc_p`.
    """

    _valid_uncertainty_types = [
        "relative",
        "absolute",
        "signed,relative",
        "signed,absolute",
        "relative,signed",
        "absolute,signed",
    ]  # type: List[str]

    def __init__(
        self,
        column="",  # type: str
        attributes=None,  # type: MutableMapping[str, Any]
        unc_p=None,  # type: List[UncSpecType]
        unc_m=None,  # type: List[UncSpecType]
    ):
        # type: (...)->None
        self.column = column
        # Fresh containers are created for omitted (or falsy) arguments.
        self.attributes = attributes or {}
        self.unc_p = unc_p or []
        self.unc_m = unc_m or []

    def validate(self):
        # type: ()->None
        """Validate the content.

        The checks are implemented as ``assert`` statements; they are
        therefore skipped if Python runs with optimization (``-O``).

        Raises
        ------
        AssertionError
            If `!column` is not a non-empty string, `!attributes` is not a
            dict with string keys, or an uncertainty source is ill-formed.
        """
        assert isinstance(self.column, str), "ValueInfo.column must be string."
        assert self.column, "ValueInfo.column is missing."
        assert TC.is_dict(self.attributes, key_type=str), "attributes not dict[str]."
        for col, t in itertools.chain(self.unc_p, self.unc_m):
            assert TC.is_list(col, element_type=str)
            assert t in self._valid_uncertainty_types, "invalid unc type: %s" % t

    @classmethod
    def from_json(cls, json_obj):
        # type: (Any)->ValueInfo
        """Initialize an instance from valid json data.

        Uncertainties are read from the keys ``"unc+"`` and ``"unc-"``; the
        key ``"unc"`` serves as the fallback for both of them and must not be
        combined with the other two. In each source, ``"column"`` may be a
        single name or a list of names; a single name is wrapped into a list.
        The constructed object is *not* validated by this method.

        Parameters
        ----------
        json_obj: typing.Any
            a valid json object.

        Returns
        -------
        ValueInfo
            Constructed instance.

        Raises
        ------
        TypeError
            If :ar:`json_obj` is not a dict.
        KeyError
            If :ar:`json_obj` lacks the key ``"column"``.
        ValueError
            If ``"unc"`` is given together with ``"unc+"`` or ``"unc-"``, or if
            an uncertainty source lacks a mandatory key.
        AssertionError
            If an uncertainty definition is not a list of mappings.
        """
        if not TC.is_dict(json_obj):
            raise TypeError('Entry of "values" must be a dict: %s' % (json_obj,))
        if "column" not in json_obj:
            raise KeyError(
                'Entry of "values" must have a key "column": %s' % (json_obj,)
            )

        obj = cls()
        obj.column = json_obj["column"]
        # `or {}` also covers an explicit JSON ``null``.
        obj.attributes = json_obj.get("attributes") or {}

        if ("unc" in json_obj) and ("unc+" in json_obj or "unc-" in json_obj):
            raise ValueError("Uncertainty duplicates: %s" % (obj.column,))

        for attr_name, key_name in [("unc_p", "unc+"), ("unc_m", "unc-")]:
            # Direction-specific definition first, then the symmetric "unc".
            unc_def = json_obj.get(key_name) or json_obj.get("unc") or None
            if unc_def is None:
                logger.warning("Uncertainty (%s) missing for %s.", key_name, obj.column)
                continue
            assert TC.is_list(unc_def, Mapping), "bad %s/%s" % (key_name, obj.column)
            try:
                unc_list = [
                    (
                        src["column"] if TC.is_list(src["column"]) else [src["column"]],
                        src["type"],
                    )
                    for src in unc_def
                ]
            except KeyError as e:
                raise ValueError(
                    "%s missing in %s (%s)" % (key_name, obj.column, e)
                )
            setattr(obj, attr_name, unc_list)

        if not (obj.unc_p and obj.unc_m):
            logger.warning("Value %s lacks uncertainties.", obj.column)
        return obj

    def to_json(self):
        # type: ()->MutableMapping[str, Any]
        """Serialize the object to a json data.

        The keys ``"unc+"`` and ``"unc-"`` are always emitted, possibly as
        empty lists.

        Returns
        -------
        dict(str, Any)
            The json data describing the object.
        """
        return {
            "column": self.column,
            "attributes": self.attributes,
            "unc+": [{"column": c, "type": t} for c, t in self.unc_p],
            "unc-": [{"column": c, "type": t} for c, t in self.unc_m],
        }
class FileInfo(object):
    """Stores file-wide annotations.

    A table structure is given by `!columns`, while in semantics a table
    consists of `!parameters` and `!values`. The information about them is
    stored as lists of `ColumnInfo`, `ParameterInfo`, and `ValueInfo` objects.
    In addition, `!reader_options` can be specified, which is directly passed
    to :func:`pandas.read_csv`.

    The attribute `!document` is provided just for documentation. The
    information is guaranteed not to modify any functionality of codes or
    packages, and thus can be anything.

    Developers must not use `!document` information except for displaying them.
    If one needs to interpret some information, one should extend this class to
    provide other data-storage for such information.

    Attributes
    ----------
    document : dict(Any, Any)
        Any information for documentation without physical meanings.
    columns : list of ColumnInfo
        The list of columns.
    parameters: list of ParameterInfo
        The list of parameters to define a data point.
    values: list of ValueInfo
        The list of values described in the file.
    reader_options: dict(str, Any)
        Options to read the CSV

        The values are directly passed to :func:`pandas.read_csv` as keyword
        arguments, so all the options of :func:`pandas.read_csv` are available.
    """

    def __init__(
        self,
        document=None,  # type: Mapping[Any, Any]
        columns=None,  # type: List[ColumnInfo]
        parameters=None,  # type: List[ParameterInfo]
        values=None,  # type: List[ValueInfo]
        reader_options=None,  # type: Mapping[str, Any]
    ):
        # type: (...)->None
        # Fresh containers are created for omitted (or falsy) arguments.
        self.document = document or {}
        self.columns = columns or []
        self.parameters = parameters or []
        self.values = values or []
        self.reader_options = reader_options or {}

    def validate(self):
        # type: ()->None
        """Validate the content.

        Each element of `!columns`, `!parameters`, and `!values` is validated
        by its own ``validate`` method. Then it is checked that the column
        indices agree with their positions, that the column names are unique,
        and that every column name referred to by parameters, values, and
        uncertainty sources exists.

        Raises
        ------
        AssertionError
            If a file-level check fails.
        TypeError, ValueError
            Possibly raised by ``validate`` of the elements.
        """
        assert TC.is_dict(self.document), "document must be a dict."
        for name in ["columns", "parameters", "values"]:
            assert TC.is_list(getattr(self, name)), "FileInfo.%s must be a list" % name
            for obj in getattr(self, name):
                obj.validate()
        assert TC.is_dict(
            self.reader_options, key_type=str
        ), "reader_options must be a dict(str, Any)."

        # validate columns (`index` matches actual index, names are unique)
        known_names = set()  # type: set
        for i, col in enumerate(self.columns):
            assert col.index == i, "Mismatched column index: %d/%d" % (i, col.index)
            assert col.name not in known_names, "Duplicated column name: " + col.name
            known_names.add(col.name)

        # validate params and values
        for p in self.parameters:
            assert p.column in known_names, "Unknown column name: %s" % p.column
        for v in self.values:
            assert v.column in known_names, "Unknown column name: %s" % v.column
            for col_list, _ in itertools.chain(v.unc_p, v.unc_m):
                for c in col_list:
                    assert c in known_names, "Unknown column name: %s" % c

    @classmethod
    def load(cls, source):
        # type: (Union[pathlib.Path, str])->FileInfo
        """Load and construct FileInfo from a json file.

        The top-level json object is expanded to the keyword arguments of
        `_load`, and the constructed object is validated before returned.

        Parameters
        ----------
        source: pathlib.Path or str
            Path to the json file.

        Returns
        -------
        FileInfo
            Constructed instance.
        """
        obj = cls()
        with open(str(source)) as f:  # py2: `open` does not accept Path objects
            obj._load(**(json.load(f)))
        obj.validate()
        return obj

    def _load(self, **kw):
        # type: (Any)->None
        """Load and construct FileInfo from keyword arguments.

        Recognized keywords are ``document``, ``columns``, ``parameters``,
        ``values``, ``reader_options``, and ``attributes``; any other keyword
        is ignored with a warning. The `!index` of each column is given by its
        position in the ``columns`` list.

        Note that file-level "attributes" are passed to each `ValueInfo` object
        as the default values and overwritten by value-level "attributes".
        """
        self.document = kw.get("document") or {}
        self.reader_options = kw.get("reader_options") or {}
        self.columns = [
            ColumnInfo(index=i, name=c.get("name"), unit=c.get("unit"))
            for i, c in enumerate(kw.get("columns") or [])
        ]
        self.parameters = [
            ParameterInfo.from_json(p) for p in kw.get("parameters") or []
        ]
        self.values = [ValueInfo.from_json(p) for p in kw.get("values") or []]

        # re-set values.attributes using the default attributes.
        default_attributes = kw.get("attributes") or {}
        for v in self.values:
            # py2
            orig = v.attributes
            v.attributes = default_attributes.copy()
            # `orig` may be None (e.g. json null); `update(None)` would raise.
            v.attributes.update(orig or {})

        # emit warnings
        if not self.document:
            logger.warning("No document is given.")
        for key in kw:
            if key not in (
                "document",
                "columns",
                "parameters",
                "values",
                "reader_options",
                "attributes",
            ):
                logger.warning('Unrecognized attribute "%s"', key)

    def get_column(self, name):
        # type: (str)->ColumnInfo
        """Return a column with specified name.

        Return `ColumnInfo` of a column with name :ar:`name`. If several
        columns have the name, the first one in `!columns` is returned.

        Arguments
        ---------
        name
            The name of column to get.

        Returns
        -------
        ColumnInfo
            The column with name :ar:`name`.

        Raises
        ------
        KeyError
            If no column is found.
        """
        for c in self.columns:
            if c.name == name:
                return c
        raise KeyError(name)