Custom Pandas Accessor#

Module containing a custom accessor and helpers for querying lydata.

lydata.accessor.get_all_true(df: DataFrame) -> Series[source]#

Return a mask with all entries set to True.

class lydata.accessor.CombineQMixin[source]#

Mixin class for combining queries.

class lydata.accessor.Q(column: str, operator: Literal['==', '<', '<=', '>', '>=', '!=', 'in'], value: Any)[source]#

Combinable query object for filtering a DataFrame.

execute(df: DataFrame) -> Series[source]#

Return a boolean mask where the query is satisfied for df.

class lydata.accessor.AndQ(q1: Q | AndQ | OrQ | NotQ, q2: Q | AndQ | OrQ | NotQ)[source]#

Query object for combining two queries with a logical AND.

>>> df = pd.DataFrame({'col1': [1, 2, 3]})
>>> q1 = Q('col1', '>', 1)
>>> q2 = Q('col1', '<', 3)
>>> and_q = q1 & q2
>>> print(and_q)
Q('col1', '>', 1) & Q('col1', '<', 3)
>>> isinstance(and_q, AndQ)
True
>>> and_q.execute(df)
0    False
1     True
2    False
Name: col1, dtype: bool
execute(df: DataFrame) -> Series[source]#

Return a boolean mask where both queries are satisfied.

class lydata.accessor.OrQ(q1: Q | AndQ | OrQ | NotQ, q2: Q | AndQ | OrQ | NotQ)[source]#

Query object for combining two queries with a logical OR.

>>> df = pd.DataFrame({'col1': [1, 2, 3]})
>>> q1 = Q('col1', '==', 1)
>>> q2 = Q('col1', '==', 3)
>>> or_q = q1 | q2
>>> print(or_q)
Q('col1', '==', 1) | Q('col1', '==', 3)
>>> isinstance(or_q, OrQ)
True
>>> or_q.execute(df)
0     True
1    False
2     True
Name: col1, dtype: bool
execute(df: DataFrame) -> Series[source]#

Return a boolean mask where either query is satisfied.

class lydata.accessor.NotQ(q: Q | AndQ | OrQ | NotQ)[source]#

Query object for negating a query.

>>> df = pd.DataFrame({'col1': [1, 2, 3]})
>>> q = Q('col1', '==', 2)
>>> not_q = ~q
>>> print(not_q)
~Q('col1', '==', 2)
>>> isinstance(not_q, NotQ)
True
>>> not_q.execute(df)
0     True
1    False
2     True
Name: col1, dtype: bool
execute(df: DataFrame) -> Series[source]#

Return a boolean mask where the query is not satisfied.

class lydata.accessor.NoneQ[source]#

Query object that always returns the entire DataFrame. Useful as default.

execute(df: DataFrame) -> Series[source]#

Return a boolean mask with all entries set to True.

class lydata.accessor.QueryPortion(match: int, total: int)[source]#

Dataclass for storing the portion of a query.

property fail: int#

Get the number of failures.

>>> QueryPortion(2, 5).fail
3
property ratio: float#

Get the ratio of matches over the total.

>>> QueryPortion(2, 5).ratio
0.4
lydata.accessor.align_diagnoses(dataset: DataFrame, modalities: list[str]) -> list[DataFrame][source]#

Align columns of specified modalities in dataset.

lydata.accessor.create_raising_func(method: str)[source]#

Create a function that raises a ValueError for an unknown method.

lydata.accessor.false_estimate(obs: ndarray, false_pos_probs: ndarray, true_neg_probs: ndarray, method: Literal['prod', 'max']) -> float[source]#

Compute estimate of False, given obs.

>>> false_estimate([True, False], [0.1, 0.6], [0.4, 0.7], method="whatever")
Traceback (most recent call last):
    ...
ValueError: Unknown method whatever
lydata.accessor.true_estimate(obs: ndarray, true_pos_probs: ndarray, false_neg_probs: ndarray, method: Literal['prod', 'max']) -> float[source]#

Compute estimate of True, given obs.

>>> obs = [True, False, np.nan]
>>> true_pos_probs = [0.8, 0.6, 0.9]
>>> false_neg_probs = [0.6, 0.7, 0.9]
>>> true_estimate(obs, true_pos_probs, false_neg_probs, method="max")
np.float64(0.8)
>>> tmp = true_estimate(obs, true_pos_probs, false_neg_probs, method="prod")
>>> np.isclose(tmp, 0.56)
np.True_
lydata.accessor.max_likelihood(obs: ndarray, specificities: ndarray, sensitivities: ndarray) -> bool[source]#

Compute most likely true state based on all obs.

>>> obs = np.array([True, False, np.nan, None])
>>> sensitivities = np.array([0.9, 0.7, 0.7, 0.7])
>>> specificities = np.array([0.9, 0.7, 0.7, 0.7])
>>> max_likelihood(obs, specificities, sensitivities)
np.True_
>>> obs = np.array([True, False, False, False])
>>> max_likelihood(obs, specificities, sensitivities)
np.False_
lydata.accessor.rank_trustworthy(obs: ndarray, specificities: ndarray, sensitivities: ndarray) -> bool[source]#

Estimate true state based on most trustworthy value in obs.

>>> obs = np.array([True, False, np.nan, None])
>>> sensitivities = np.array([0.9, 0.7, 0.7, 0.7])
>>> specificities = np.array([0.9, 0.7, 0.7, 0.7])
>>> rank_trustworthy(obs, specificities, sensitivities)
np.True_
>>> obs = np.array([True, False, False, False])
>>> rank_trustworthy(obs, specificities, sensitivities)
np.True_
lydata.accessor.expand_mapping(short_map: dict[str, Any], colname_map: dict[str | tuple[str, str, str], Any] | None = None) -> dict[tuple[str, str, str], Any][source]#

Expand the column map to full column names.

>>> expand_mapping({'age': 'foo', 'hpv': 'bar'})
{('patient', '#', 'age'): 'foo', ('patient', '#', 'hpv_status'): 'bar'}
class lydata.accessor.LydataAccessor(obj: DataFrame)[source]#

Custom accessor for handling lymphatic involvement data.

validate(modalities: list[str] | None = None) -> DataFrame[source]#

Validate the DataFrame against the lydata schema.

query(query: Q | AndQ | OrQ | NotQ | None = None) -> DataFrame[source]#

Return a DataFrame with rows that satisfy the query.

portion(query: Q | AndQ | OrQ | NotQ | None = None, given: Q | AndQ | OrQ | NotQ | None = None) -> QueryPortion[source]#

Compute how many rows satisfy a query, given some other conditions.

Returns a QueryPortion holding the number of matches and the total number of rows, such that the ratio of the two is the portion of interest.

>>> df = pd.DataFrame({'x': [1, 2, 3]})
>>> df.lydata.portion(query=Q('x', '==', 2), given=Q('x', '>', 1))
QueryPortion(match=np.int64(1), total=np.int64(2))
>>> df.lydata.portion(query=Q('x', '==', 2), given=Q('x', '>', 3))
QueryPortion(match=np.int64(0), total=np.int64(0))
stats(agg_funcs: dict[str | tuple[str, str, str], Callable[[Series], Series]] | None = None, use_shortnames: bool = True, out_format: str = 'dict') -> Any[source]#

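Compute summary statistics for columns of the DataFrame. With the default arguments, the result holds per-value counts keyed by short column name, as in the example below.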

>>> df = pd.DataFrame({
...     ('patient', '#', 'age'): [61, 52, 73, 61],
...     ('patient', '#', 'hpv_status'): [True, False, None, True],
...     ('tumor', '1', 't_stage'): [2, 3, 1, 2],
... })
>>> df.lydata.stats()   
{'age': {61: 2, 52: 1, 73: 1},
 'hpv': {True: 2, False: 1, None: 1},
 't_stage': {2: 2, 3: 1, 1: 1}}
combine(modalities: list[Modality] | None = None, method: Literal['max_llh', 'rank'] = 'max_llh') -> DataFrame[source]#

Combine diagnoses of modalities using method.

lydata.accessor.main() -> None[source]#

Run main function.