# Source code for retentioneering.data_processors_lib.label_lost_users
from __future__ import annotations
from typing import List, Optional, Tuple, Union
import numpy as np
import pandas as pd
from retentioneering.backend.tracker import collect_data_performance, time_performance
from retentioneering.common.constants import DATETIME_UNITS
from retentioneering.data_processor import DataProcessor
from retentioneering.eventstream.types import EventstreamSchemaType
from retentioneering.params_model import ParamsModel
from retentioneering.utils.doc_substitution import docstrings
from retentioneering.utils.hash_object import hash_dataframe
from retentioneering.widget.widgets import ListOfIds, ReteTimeWidget
class LabelLostUsersParams(ParamsModel):
    """
    A class with parameters for :py:class:`.LabelLostUsers` class.
    """

    # (threshold value, time unit) pair; the unit must be one of DATETIME_UNITS.
    timeout: Optional[Tuple[float, DATETIME_UNITS]]
    # Explicit ids of the users to be labeled with the ``lost_user`` event.
    lost_users_list: Optional[Union[List[int], List[str]]]

    # One widget instance per parameter name.
    # NOTE(review): presumably consumed by the GUI layer to render inputs - confirm.
    _widgets = {
        "timeout": ReteTimeWidget(),
        "lost_users_list": ListOfIds(),
    }
@docstrings.get_sections(base="LabelLostUsers")  # type: ignore
class LabelLostUsers(DataProcessor):
    """
    Create one of synthetic events in each user's path:
    ``lost_user`` or ``absent_user``.

    Parameters
    ----------
    Only one of parameters could be used at the same time
    timeout : Tuple(float, :numpy_link:`DATETIME_UNITS<>`), optional
        Threshold value and its unit of measure.
        Calculate timedelta between the last event in each user's path and the last event in the whole eventstream.
        For users with timedelta greater or equal to selected ``timeout``, a new synthetic event - ``lost_user``
        will be added.
        For other users paths a new synthetic event - ``absent_user`` will be added.
    lost_users_list : list of int or list of str, optional
        If the `list of user_ids` is given new synthetic event - ``lost_user`` will be added to each user from the list.
        For other user's paths will be added new synthetic event - ``absent_user``.

    Returns
    -------
    Eventstream
        ``Eventstream`` with new synthetic events only - one for each user:

        +-----------------+-----------------+------------------+
        | **event_name**  | **event_type**  |  **timestamp**   |
        +-----------------+-----------------+------------------+
        | lost_user       | lost_user       | last_event       |
        +-----------------+-----------------+------------------+
        | absent_user     | absent_user     | last_event       |
        +-----------------+-----------------+------------------+

    Raises
    ------
    ValueError
        Raised when both ``timeout`` and ``lost_users_list`` are either empty or given.

    Notes
    -----
    See :doc:`Data processors user guide</user_guides/dataprocessors>` for the details.
    """

    params: LabelLostUsersParams

    @time_performance(
        scope="label_lost_users",
        event_name="init",
    )
    def __init__(self, params: LabelLostUsersParams):
        super().__init__(params=params)

    @time_performance(  # type: ignore
        scope="label_lost_users",
        event_name="apply",
    )
    def apply(self, df: pd.DataFrame, schema: EventstreamSchemaType) -> pd.DataFrame:
        """
        Append one ``lost_user`` / ``absent_user`` row per user to ``df``.

        Parameters
        ----------
        df : pd.DataFrame
            Events table; its columns are addressed through ``schema``.
        schema : EventstreamSchemaType
            Supplies the names of the user id, timestamp, event type and
            event name columns.

        Returns
        -------
        pd.DataFrame
            ``df`` concatenated with the synthetic rows. Each synthetic row is
            the per-user ``groupby(...).last()`` row with the event type and
            event name columns overwritten by the label.

        Raises
        ------
        ValueError
            If ``timeout`` and ``lost_users_list`` are both set or both empty.
        """
        user_col = schema.user_id
        time_col = schema.event_timestamp
        type_col = schema.event_type
        event_col = schema.event_name

        timeout, timeout_unit = None, None
        lost_users_list = self.params.lost_users_list
        data_lost = pd.DataFrame()

        if self.params.timeout:
            timeout, timeout_unit = self.params.timeout

        # Exactly one of the two parameters has to be provided. Truthiness is
        # used on purpose: an empty list counts as "not given".
        if timeout and lost_users_list:
            raise ValueError("timeout and lost_users_list parameters cannot be used simultaneously!")
        if not timeout and not lost_users_list:
            raise ValueError("Either timeout or lost_users_list must be specified!")

        if timeout and timeout_unit:
            data_lost = df.groupby(user_col, as_index=False).last()
            # Gap between the latest per-user last event and each user's last
            # event, expressed in ``timeout_unit``. Kept in a local Series so
            # no temporary column is written to (or deleted from) the frame.
            gap_to_end = data_lost[time_col].max() - data_lost[time_col]
            gap_to_end = gap_to_end / np.timedelta64(1, timeout_unit)  # type: ignore
            data_lost[type_col] = np.where(gap_to_end < timeout, "absent_user", "lost_user")
            data_lost[event_col] = data_lost[type_col]
        elif lost_users_list:
            data_lost = df.groupby(user_col, as_index=False).last()
            # Use the schema's user id column rather than a hard-coded
            # "user_id", so custom column names work as well.
            data_lost[type_col] = np.where(data_lost[user_col].isin(lost_users_list), "lost_user", "absent_user")
            data_lost[event_col] = data_lost[type_col]

        result = pd.concat([df, data_lost])

        collect_data_performance(
            scope="label_lost_users",
            event_name="metadata",
            called_params=self.to_dict()["values"],
            not_hash_values=["timeout"],
            performance_data={
                "parent": {
                    "shape": df.shape,
                    "hash": hash_dataframe(df),
                },
                "child": {
                    "shape": result.shape,
                    "hash": hash_dataframe(result),
                },
            },
        )

        return result