Custom Pandas Accessor#
Module containing a custom accessor and helpers for querying lydata.
- lydata.accessor.get_all_true(df: DataFrame) Series[source]#
Return a mask with all entries set to `True`.
- class lydata.accessor.Q(column: str, operator: Literal['==', '<', '<=', '>', '>=', '!=', 'in'], value: Any)[source]#
Combinable query object for filtering a DataFrame.
- class lydata.accessor.AndQ(q1: Q | AndQ | OrQ | NotQ, q2: Q | AndQ | OrQ | NotQ)[source]#
Query object for combining two queries with a logical AND.
>>> df = pd.DataFrame({'col1': [1, 2, 3]})
>>> q1 = Q('col1', '>', 1)
>>> q2 = Q('col1', '<', 3)
>>> and_q = q1 & q2
>>> print(and_q)
Q('col1', '>', 1) & Q('col1', '<', 3)
>>> isinstance(and_q, AndQ)
True
>>> and_q.execute(df)
0    False
1     True
2    False
Name: col1, dtype: bool
- class lydata.accessor.OrQ(q1: Q | AndQ | OrQ | NotQ, q2: Q | AndQ | OrQ | NotQ)[source]#
Query object for combining two queries with a logical OR.
>>> df = pd.DataFrame({'col1': [1, 2, 3]})
>>> q1 = Q('col1', '==', 1)
>>> q2 = Q('col1', '==', 3)
>>> or_q = q1 | q2
>>> print(or_q)
Q('col1', '==', 1) | Q('col1', '==', 3)
>>> isinstance(or_q, OrQ)
True
>>> or_q.execute(df)
0     True
1    False
2     True
Name: col1, dtype: bool
- class lydata.accessor.NotQ(q: Q | AndQ | OrQ | NotQ)[source]#
Query object for negating a query.
>>> df = pd.DataFrame({'col1': [1, 2, 3]})
>>> q = Q('col1', '==', 2)
>>> not_q = ~q
>>> print(not_q)
~Q('col1', '==', 2)
>>> isinstance(not_q, NotQ)
True
>>> not_q.execute(df)
0     True
1    False
2     True
Name: col1, dtype: bool
- class lydata.accessor.NoneQ[source]#
Query object that always returns the entire DataFrame. Useful as default.
- class lydata.accessor.QueryPortion(match: int, total: int)[source]#
Dataclass for storing the portion of a query.
- lydata.accessor.align_diagnoses(dataset: DataFrame, modalities: list[str]) list[DataFrame][source]#
Align columns of specified modalities in `dataset`.
- lydata.accessor.false_estimate(obs: ndarray, false_pos_probs: ndarray, true_neg_probs: ndarray, method: Literal['prod', 'max']) float[source]#
Compute estimate of `False`, given `obs`.
>>> false_estimate([True, False], [0.1, 0.6], [0.4, 0.7], method="whatever")
Traceback (most recent call last):
    ...
ValueError: Unknown method whatever
- lydata.accessor.true_estimate(obs: ndarray, true_pos_probs: ndarray, false_neg_probs: ndarray, method: Literal['prod', 'max']) float[source]#
Compute estimate of `True`, given `obs`.
>>> obs = [True, False, np.nan]
>>> true_pos_probs = [0.8, 0.6, 0.9]
>>> false_neg_probs = [0.6, 0.7, 0.9]
>>> true_estimate(obs, true_pos_probs, false_neg_probs, method="max")
np.float64(0.8)
>>> tmp = true_estimate(obs, true_pos_probs, false_neg_probs, method="prod")
>>> np.isclose(tmp, 0.56)
np.True_
- lydata.accessor.max_likelihood(obs: ndarray, specificities: ndarray, sensitivities: ndarray) bool[source]#
Compute most likely true state based on all `obs`.
>>> obs = np.array([True, False, np.nan, None])
>>> sensitivities = np.array([0.9, 0.7, 0.7, 0.7])
>>> specificities = np.array([0.9, 0.7, 0.7, 0.7])
>>> max_likelihood(obs, sensitivities, specificities)
np.True_
>>> obs = np.array([True, False, False, False])
>>> max_likelihood(obs, sensitivities, specificities)
np.False_
- lydata.accessor.rank_trustworthy(obs: ndarray, specificities: ndarray, sensitivities: ndarray) bool[source]#
Estimate true state based on most trustworthy value in `obs`.
>>> obs = np.array([True, False, np.nan, None])
>>> sensitivities = np.array([0.9, 0.7, 0.7, 0.7])
>>> specificities = np.array([0.9, 0.7, 0.7, 0.7])
>>> rank_trustworthy(obs, sensitivities, specificities)
np.True_
>>> obs = np.array([True, False, False, False])
>>> rank_trustworthy(obs, sensitivities, specificities)
np.True_
- lydata.accessor.expand_mapping(short_map: dict[str, Any], colname_map: dict[str | tuple[str, str, str], Any] | None = None) dict[tuple[str, str, str], Any][source]#
Expand the column map to full column names.
>>> expand_mapping({'age': 'foo', 'hpv': 'bar'})
{('patient', '#', 'age'): 'foo', ('patient', '#', 'hpv_status'): 'bar'}
- class lydata.accessor.LydataAccessor(obj: DataFrame)[source]#
Custom accessor for handling lymphatic involvement data.
- validate(modalities: list[str] | None = None) DataFrame[source]#
Validate the DataFrame against the lydata schema.
- query(query: Q | AndQ | OrQ | NotQ | None = None) DataFrame[source]#
Return a DataFrame with rows that satisfy the `query`.
- portion(query: Q | AndQ | OrQ | NotQ | None = None, given: Q | AndQ | OrQ | NotQ | None = None) QueryPortion[source]#
Compute how many rows satisfy a `query`, `given` some other conditions.
Returns a tuple with the number of matches and the number of total rows, such that the ratio of the two is the portion of interest.
>>> df = pd.DataFrame({'x': [1, 2, 3]})
>>> df.lydata.portion(query=Q('x', '==', 2), given=Q('x', '>', 1))
QueryPortion(match=np.int64(1), total=np.int64(2))
>>> df.lydata.portion(query=Q('x', '==', 2), given=Q('x', '>', 3))
QueryPortion(match=np.int64(0), total=np.int64(0))
- stats(agg_funcs: dict[str | tuple[str, str, str], Callable[[Series], Series]] | None = None, use_shortnames: bool = True, out_format: str = 'dict') Any[source]#
Compute statistics.
>>> df = pd.DataFrame({
...     ('patient', '#', 'age'): [61, 52, 73, 61],
...     ('patient', '#', 'hpv_status'): [True, False, None, True],
...     ('tumor', '1', 't_stage'): [2, 3, 1, 2],
... })
>>> df.lydata.stats()
{'age': {61: 2, 52: 1, 73: 1}, 'hpv': {True: 2, False: 1, None: 1}, 't_stage': {2: 2, 3: 1, 1: 1}}