Custom Pandas Accessor#

Module containing a custom accessor and helpers for querying lydata.

lydata.accessor.get_all_true(df: DataFrame) Series[source]#

Return a mask with all entries set to True.

class lydata.accessor.CombineQMixin[source]#

Mixin class for combining queries.

class lydata.accessor.Q(column: str, operator: Literal['==', '<', '<=', '>', '>=', '!=', 'in'], value: Any)[source]#

Combinable query object for filtering a DataFrame.

execute(df: DataFrame) Series[source]#

Return a boolean mask where the query is satisfied for df.

class lydata.accessor.AndQ(q1: Q | AndQ | OrQ | NotQ | None, q2: Q | AndQ | OrQ | NotQ | None)[source]#

Query object for combining two queries with a logical AND.

>>> df = pd.DataFrame({'col1': [1, 2, 3]})
>>> q1 = Q('col1', '>', 1)
>>> q2 = Q('col1', '<', 3)
>>> and_q = q1 & q2
>>> print(and_q)
Q('col1', '>', 1) & Q('col1', '<', 3)
>>> isinstance(and_q, AndQ)
True
>>> and_q.execute(df)
0    False
1     True
2    False
Name: col1, dtype: bool
execute(df: DataFrame) Series[source]#

Return a boolean mask where both queries are satisfied.

class lydata.accessor.OrQ(q1: Q | AndQ | OrQ | NotQ | None, q2: Q | AndQ | OrQ | NotQ | None)[source]#

Query object for combining two queries with a logical OR.

>>> df = pd.DataFrame({'col1': [1, 2, 3]})
>>> q1 = Q('col1', '==', 1)
>>> q2 = Q('col1', '==', 3)
>>> or_q = q1 | q2
>>> print(or_q)
Q('col1', '==', 1) | Q('col1', '==', 3)
>>> isinstance(or_q, OrQ)
True
>>> or_q.execute(df)
0     True
1    False
2     True
Name: col1, dtype: bool
execute(df: DataFrame) Series[source]#

Return a boolean mask where either query is satisfied.

class lydata.accessor.NotQ(q: Q | AndQ | OrQ | NotQ | None)[source]#

Query object for negating a query.

>>> df = pd.DataFrame({'col1': [1, 2, 3]})
>>> q = Q('col1', '==', 2)
>>> not_q = ~q
>>> print(not_q)
~Q('col1', '==', 2)
>>> isinstance(not_q, NotQ)
True
>>> not_q.execute(df)
0     True
1    False
2     True
Name: col1, dtype: bool
execute(df: DataFrame) Series[source]#

Return a boolean mask where the query is not satisfied.

class lydata.accessor.NoneQ[source]#

Query object that always returns the entire DataFrame. Useful as default.

execute(df: DataFrame) Series[source]#

Return a boolean mask with all entries set to True.

class lydata.accessor.QueryPortion(match: int, total: int)[source]#

Dataclass for storing the portion of a query.

property fail: int#

Get the number of failures.

>>> QueryPortion(2, 5).fail
3
property ratio: float#

Get the ratio of matches over the total.

>>> QueryPortion(2, 5).ratio
0.4
property percent: float#

Get the percentage of matches over the total.

>>> QueryPortion(2, 5).percent
40.0
invert() QueryPortion[source]#

Return the inverted portion.

>>> QueryPortion(2, 5).invert()
QueryPortion(match=3, total=5)
lydata.accessor.align_diagnoses(dataset: DataFrame, modalities: list[str]) list[DataFrame][source]#

Stack aligned diagnosis tables in dataset for each of modalities.

lydata.accessor.create_raising_func(method: str)[source]#

Raise ValueError for wrong method.

lydata.accessor.false_estimate(obs: ndarray, false_pos_probs: ndarray, true_neg_probs: ndarray, method: Literal['prod', 'max']) float[source]#

Compute estimate of False, given obs.

>>> false_estimate([True, False], [0.1, 0.6], [0.4, 0.7], method="whatever")
Traceback (most recent call last):
    ...
ValueError: Unknown method whatever
lydata.accessor.true_estimate(obs: ndarray, true_pos_probs: ndarray, false_neg_probs: ndarray, method: Literal['prod', 'max']) float[source]#

Compute estimate of True, given obs.

>>> obs = [True, False, np.nan]
>>> true_pos_probs = [0.8, 0.6, 0.9]
>>> false_neg_probs = [0.6, 0.7, 0.9]
>>> true_estimate(obs, true_pos_probs, false_neg_probs, method="max")
np.float64(0.8)
>>> tmp = true_estimate(obs, true_pos_probs, false_neg_probs, method="prod")
>>> np.isclose(tmp, 0.56)
np.True_
lydata.accessor.max_likelihood(obs: ndarray, specificities: ndarray, sensitivities: ndarray) bool[source]#

Compute most likely true state based on all obs.

>>> obs = np.array([True, False, np.nan, None])
>>> sensitivities = np.array([0.9, 0.7, 0.7, 0.7])
>>> specificities = np.array([0.9, 0.7, 0.7, 0.7])
>>> max_likelihood(obs, sensitivities, specificities)
np.True_
>>> obs = np.array([True, False, False, False])
>>> max_likelihood(obs, sensitivities, specificities)
np.False_
lydata.accessor.rank_trustworthy(obs: ndarray, specificities: ndarray, sensitivities: ndarray) bool[source]#

Estimate true state based on most trustworthy value in obs.

>>> obs = np.array([True, False, np.nan, None])
>>> sensitivities = np.array([0.9, 0.7, 0.7, 0.7])
>>> specificities = np.array([0.9, 0.7, 0.7, 0.7])
>>> rank_trustworthy(obs, sensitivities, specificities)
np.True_
>>> obs = np.array([True, False, False, False])
>>> rank_trustworthy(obs, sensitivities, specificities)
np.True_
lydata.accessor.expand_mapping(short_map: dict[str, Any], colname_map: dict[str | tuple[str, str, str], Any] | None = None) dict[tuple[str, str, str], Any][source]#

Expand the column map to full column names.

>>> expand_mapping({'age': 'foo', 'hpv': 'bar'})
{('patient', '#', 'age'): 'foo', ('patient', '#', 'hpv_status'): 'bar'}
class lydata.accessor.LyDataAccessor(obj: DataFrame)[source]#

Custom accessor for handling lymphatic involvement data.

This aims to provide an easy and user-friendly interface to the most commonly needed operations on the lymphatic involvement data we publish in the lydata project.

validate(modalities: list[str] | None = None) DataFrame[source]#

Validate the DataFrame against the lydata schema.

The schema is constructed by the construct_schema() function from the provided modalities. If modalities is None, the result of get_default_modalities() is used instead.

get_modalities(_filter: list[str] | None = None) list[str][source]#

Return the modalities present in this DataFrame.

Warning

This method assumes that all top-level columns are modalities, except for some predefined non-modality columns. For custom datasets, this may not be correct. In that case, you should pass the list of top-level columns that are not modalities to _filter.

query(query: Q | AndQ | OrQ | NotQ | None = None) DataFrame[source]#

Return a DataFrame with rows that satisfy the query.

A query is a Q object that can be combined with other queries using logical operators. See the documentation of the Q class for more information.

portion(query: Q | AndQ | OrQ | NotQ | None = None, given: Q | AndQ | OrQ | NotQ | None = None) QueryPortion[source]#

Compute how many rows satisfy a query, given some other conditions.

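This returns a QueryPortion object that holds the number of rows satisfying both the query and the given condition (match), together with the number of rows satisfying the given condition alone (total). The ratio of the two is the portion of rows matching the query among those fulfilling the given condition.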

>>> df = pd.DataFrame({'x': [1, 2, 3]})
>>> df.ly.portion(query=Q('x', '==', 2), given=Q('x', '>', 1))
QueryPortion(match=np.int64(1), total=np.int64(2))
>>> df.ly.portion(query=Q('x', '==', 2), given=Q('x', '>', 3))
QueryPortion(match=np.int64(0), total=np.int64(0))
stats(agg_funcs: dict[str | tuple[str, str, str], Callable[[Series], Series]] | None = None, use_shortnames: bool = True, out_format: str = 'dict') Any[source]#

Compute statistics.

The agg_funcs argument is a mapping of column names to functions that receive a pd.Series and return a pd.Series. The default is a useful selection of statistics for the most common columns. E.g., for the column ('patient', '#', 'age') (or its short column name age), the default function returns the value counts.

The use_shortnames argument determines whether the output should use the short column names or the long ones. The default is to use the short names.

With out_format one can specify the output format. Available options are those formats for which pandas has a to_<format> method.

>>> df = pd.DataFrame({
...     ('patient', '#', 'age'): [61, 52, 73, 61],
...     ('patient', '#', 'hpv_status'): [True, False, None, True],
...     ('tumor', '1', 't_stage'): [2, 3, 1, 2],
... })
>>> df.ly.stats()   
{'age': {61: 2, 52: 1, 73: 1},
 'hpv': {True: 2, False: 1, None: 1},
 't_stage': {2: 2, 3: 1, 1: 1}}
combine(modalities: dict[str, ModalityConfig] | None = None, method: Literal['max_llh', 'rank'] = 'max_llh') DataFrame[source]#

Combine diagnoses of modalities using method.

Details on what each method does, and how, can be found in the respective documentation of max_likelihood() and rank_trustworthy().

The result contains only the combined columns. The intended use is to update() the original DataFrame with the result.

>>> df = pd.DataFrame({
...     ('MRI'      , 'ipsi', 'I'): [False, True , True , None],
...     ('CT'       , 'ipsi', 'I'): [False, True , False, True],
...     ('pathology', 'ipsi', 'I'): [True , None , False, None],
... })
>>> df.ly.combine()   
     ipsi
        I
0    True
1    True
2   False
3    True
infer_sublevels(modalities: list[str] | None = None, sides: list[Literal['ipsi', 'contra']] | None = None, subdivisions: dict[str, list[str]] | None = None) DataFrame[source]#

Determine involvement status of an LNL’s sublevels (e.g., IIa and IIb).

Some LNLs have sublevels, e.g., IIa and IIb. The involvement of these sublevels is not always reported; sometimes only the superlevel’s status is available. This function infers the status of the sublevels from the superlevel.

The sublevels’ status is computed for the specified modalities. Whether a superlevel has sublevels, and which ones, is specified in subdivisions. The default subdivisions argument looks like this:

{
    "I": ["a", "b"],
    "II": ["a", "b"],
    "V": ["a", "b"],
}

The resulting DataFrame will only contain the newly inferred sublevel columns. Thus, one can simply update() the original DataFrame with the result.

>>> df = pd.DataFrame({
...     ('MRI', 'ipsi'  , 'I' ): [True , False, False, None],
...     ('MRI', 'contra', 'I' ): [False, True , False, None],
...     ('MRI', 'ipsi'  , 'II'): [False, False, True , None],
...     ('MRI', 'ipsi'  , 'IV'): [False, False, True , None],
...     ('CT' , 'ipsi'  , 'I' ): [True , False, False, None],
... })
>>> df.ly.infer_sublevels(modalities=["MRI"])   
     MRI
    ipsi                      contra
      Ia     Ib    IIa    IIb     Ia     Ib
0   None   None  False  False  False  False
1  False  False  False  False   None   None
2  False  False   None   None  False  False
3   None   None   None   None   None   None
infer_superlevels(modalities: list[str] | None = None, sides: list[Literal['ipsi', 'contra']] | None = None, subdivisions: dict[str, list[str]] | None = None) DataFrame[source]#

Determine involvement status of an LNL’s superlevel (e.g., II).

Some LNLs have sublevels, e.g., IIa and IIb. In real data, sometimes the sublevels are reported, sometimes only the superlevel. This function infers the status of the superlevel from the sublevels.

The superlevel’s status is computed for the specified modalities. Whether a superlevel has sublevels, and which ones, is specified in subdivisions.

The resulting DataFrame will only contain the newly inferred superlevel columns. This way, it is straightforward to update() the original DataFrame.

>>> df = pd.DataFrame({
...     ('MRI', 'ipsi'  , 'Ia' ): [True , False, False, None],
...     ('MRI', 'ipsi'  , 'Ib' ): [False, True , False, None],
...     ('MRI', 'contra', 'IIa'): [False, False, None , None],
...     ('MRI', 'contra', 'IIb'): [False, True , True , None],
...     ('CT' , 'ipsi'  , 'I'  ): [True , False, False, None],
... })
>>> df.ly.infer_superlevels(modalities=["MRI"]) 
     MRI
    ipsi contra
       I     II
0   True  False
1   True   True
2  False   True
3   None   None
lydata.accessor.main() None[source]#

Run main function.

lydata.accessor.run_doctests() None[source]#

Run the module doctests.