Source code for retentioneering.tooling.user_lifetime_hist.user_lifetime_hist

from __future__ import annotations

import warnings
from typing import Literal

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from retentioneering.backend.tracker import (
    collect_data_performance,
    time_performance,
    track,
)
from retentioneering.constants import DATETIME_UNITS
from retentioneering.eventstream.types import EventstreamType
from retentioneering.tooling.constants import BINS_ESTIMATORS


[docs]class UserLifetimeHist:
    """

    Plot the distribution of user lifetimes. A ``users lifetime`` is the timedelta between
    the first and the last events of the user.

    Parameters
    ----------
    eventstream : EventstreamType

    See Also
    --------
    .EventTimestampHist : Plot the distribution of events over time.
    .TimedeltaHist : Plot the distribution of the time deltas between two events.
    .Eventstream.describe : Show general eventstream statistics.
    .Eventstream.describe_events : Show general eventstream events statistics.
    .DropPaths : Filter user paths based on the path length, removing the paths that are shorter than the
                               specified number of events or cut_off.

    Notes
    -----
    See :ref:`Eventstream user guide<eventstream_user_lifetime>` for the details.

    """

    __eventstream: EventstreamType
    timedelta_unit: DATETIME_UNITS
    log_scale: bool | tuple[bool, bool] | None
    lower_cutoff_quantile: float | None
    upper_cutoff_quantile: float | None
    bins: int | Literal[BINS_ESTIMATORS]
    bins_to_show: np.ndarray
    values_to_plot: np.ndarray

    @time_performance(
        scope="user_lifetime_hist",
        event_name="init",
    )
    def __init__(self, eventstream: EventstreamType) -> None:
        self.__eventstream = eventstream
        self.user_col = self.__eventstream.schema.user_id
        self.event_col = self.__eventstream.schema.event_name
        self.time_col = self.__eventstream.schema.event_timestamp

        self.bins_to_show = np.array([])
        self.values_to_plot = np.array([])

    def _remove_cutoff_values(self, series: pd.Series) -> pd.Series:
        idx = [True] * len(series)
        if self.upper_cutoff_quantile is not None:
            idx &= series <= series.quantile(self.upper_cutoff_quantile)
        if self.lower_cutoff_quantile is not None:
            idx &= series >= series.quantile(self.lower_cutoff_quantile)
        return series[idx]

    def __validate_input(
        self,
        log_scale: bool | tuple[bool, bool] | None = None,
        lower_cutoff_quantile: float | None = None,
        upper_cutoff_quantile: float | None = None,
    ) -> tuple[tuple[bool, bool], float | None, float | None]:
        if lower_cutoff_quantile is not None:
            if not 0 < lower_cutoff_quantile < 1:
                raise ValueError("lower_cutoff_quantile should be a fraction between 0 and 1.")

        if upper_cutoff_quantile is not None:
            if not 0 < upper_cutoff_quantile < 1:
                raise ValueError("upper_cutoff_quantile should be a fraction between 0 and 1.")

        if lower_cutoff_quantile is not None and upper_cutoff_quantile is not None:
            if lower_cutoff_quantile > upper_cutoff_quantile:
                warnings.warn("lower_cutoff_quantile exceeds upper_cutoff_quantile; no data passed to the histogram")

        if log_scale:
            if isinstance(log_scale, bool):
                log_scale = (log_scale, False)
            else:
                log_scale = log_scale
        else:
            log_scale = (False, False)

        return log_scale, upper_cutoff_quantile, lower_cutoff_quantile

[docs]    @time_performance(
        scope="user_lifetime_hist",
        event_name="fit",
    )
    def fit(
        self,
        timedelta_unit: DATETIME_UNITS = "s",
        log_scale: bool | tuple[bool, bool] | None = None,
        lower_cutoff_quantile: float | None = None,
        upper_cutoff_quantile: float | None = None,
        bins: int | Literal[BINS_ESTIMATORS] = 20,
    ) -> None:
        """
        Calculate values for the histplot.

        Parameters
        ----------
        timedelta_unit : :numpy_link:`DATETIME_UNITS<>`, default 's'
            Specify units of time differences the histogram should use. Use "s" for seconds, "m" for minutes,
            "h" for hours and "D" for days.
        log_scale : bool or tuple of bool, optional

            - If ``True`` - apply log scaling to the ``x`` axis.
            - If tuple of bool - apply log scaling to the (``x``,``y``) axes correspondingly.
        lower_cutoff_quantile : float, optional
            Specify time distance quantile as the lower boundary. The values below the boundary are truncated.
        upper_cutoff_quantile : float, optional
            Specify time distance quantile as the upper boundary. The values above the boundary are truncated.
        bins : int or str, default 20
            Generic bin parameter that can be the name of a reference rule or
            the number of bins. Passed to :numpy_bins_link:`numpy.histogram_bin_edges<>`.

        Returns
        -------
        None
        """
        called_params = {
            "timedelta_unit": timedelta_unit,
            "log_scale": log_scale,
            "lower_cutoff_quantile": lower_cutoff_quantile,
            "upper_cutoff_quantile": upper_cutoff_quantile,
            "bins": bins,
        }
        not_hash_values = ["timedelta_unit"]

        self.log_scale, self.upper_cutoff_quantile, self.lower_cutoff_quantile = self.__validate_input(
            log_scale,
            lower_cutoff_quantile,
            upper_cutoff_quantile,
        )

        self.timedelta_unit = timedelta_unit
        self.bins = bins

        data = self.__eventstream.to_dataframe(copy=True).groupby(self.user_col)[self.time_col].agg(["min", "max"])
        data["time_passed"] = data["max"] - data["min"]
        values_to_plot = (data["time_passed"] / np.timedelta64(1, self.timedelta_unit)).reset_index(  # type: ignore
            drop=True
        )

        if self._remove_cutoff_values:  # type: ignore
            values_to_plot = self._remove_cutoff_values(values_to_plot).to_numpy()
        if self.log_scale[0]:
            log_adjustment = np.timedelta64(100, "ms") / np.timedelta64(1, self.timedelta_unit)
            values_to_plot = np.where(values_to_plot != 0, values_to_plot, values_to_plot + log_adjustment)  # type: ignore
            bins_to_show = np.power(10, np.histogram_bin_edges(np.log10(values_to_plot), bins=self.bins))
        else:
            bins_to_show = np.histogram_bin_edges(values_to_plot, bins=self.bins)
        if len(values_to_plot) == 0:
            bins_to_show = np.array([])

        self.bins_to_show = bins_to_show
        self.values_to_plot = values_to_plot  # type: ignore
        collect_data_performance(
            scope="user_lifetime_hist",
            event_name="metadata",
            called_params=called_params,
            not_hash_values=not_hash_values,
            performance_data={},
            eventstream_index=self.__eventstream._eventstream_index,
        )

    @property
    @time_performance(
        scope="user_lifetime_hist",
        event_name="values",
    )
    def values(self) -> tuple[np.ndarray, np.ndarray]:
        """

        Returns
        -------
        tuple(np.ndarray, np.ndarray)
            1. The first array contains the values for histogram.
            2. The first array contains the bin edges.

        """
        return self.values_to_plot, self.bins_to_show

[docs]    @time_performance(
        scope="user_lifetime_hist",
        event_name="plot",
    )
    def plot(self, width: float = 6.0, height: float = 4.5) -> matplotlib.axes.Axes:
        """
        Create a sns.histplot based on the calculated values.

        Parameters
        ----------
        width : float, default 6.0
            Width in inches.
        height : float, default 4.5
            Height in inches.

        Returns
        -------
        :matplotlib_axes:`matplotlib.axes.Axes<>`
            The matplotlib axes containing the plot.

        """
        figsize = (width, height)
        plt.subplots(figsize=figsize)

        hist = sns.histplot(self.values_to_plot, bins=self.bins, log_scale=self.log_scale)
        hist.set_title("User lifetime histogram")
        hist.set_xlabel(f"Time units: {self.timedelta_unit}")

        collect_data_performance(
            scope="event_timestamp_hist",
            event_name="metadata",
            called_params={"width": width, "height": height},
            performance_data={},
            eventstream_index=self.__eventstream._eventstream_index,
        )

        return hist