Source code for retentioneering.tooling.step_sankey.step_sankey

from __future__ import annotations

from typing import Any, Dict

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns

from retentioneering.backend.tracker import (
    collect_data_performance,
    time_performance,
    track,
    tracker,
)
from retentioneering.eventstream.types import EventstreamType
from retentioneering.tooling.mixins.ended_events import EndedEventsMixin


[docs]class StepSankey(EndedEventsMixin):
    """
    A class for the visualization of user paths in stepwise manner using Sankey diagram.

    Parameters
    ----------
    eventstream : EventstreamType


    See Also
    --------
    .Eventstream.step_sankey : Call StepSankey tool as an eventstream method.
    .CollapseLoops : Find loops and create new synthetic events in the paths of all users having such sequences.
    .StepMatrix : This class provides methods for step matrix calculation and visualization.

    Notes
    -----
    See :doc:`StepSankey user guide</user_guides/step_sankey>` for the details.

    """

    max_steps: int
    threshold: int | float
    sorting: list | None
    targets: list[str] | str | None

    data_grp_nodes: pd.DataFrame
    data: pd.DataFrame
    data_grp_links: pd.DataFrame
    data_for_plot: dict

    @time_performance(
        scope="step_sankey",
        event_name="init",
    )
    def __init__(self, eventstream: EventstreamType) -> None:
        self.__eventstream = eventstream
        self.user_col = self.__eventstream.schema.user_id
        self.event_col = self.__eventstream.schema.event_name
        self.time_col = self.__eventstream.schema.event_timestamp
        self.event_index_col = self.__eventstream.schema.event_index

        self.data_grp_nodes = pd.DataFrame()
        self.data = pd.DataFrame()
        self.data_grp_links = pd.DataFrame()
        self.data_for_plot = {}

    @staticmethod
    def _make_color(
        event: str,
        all_events: list,
        palette: list,
    ) -> tuple[int, int, int]:
        """
        It is a color picking function

        Parameters
        ----------
        event : str
            An event for color setting
        all_events : list
            A list of all events
        palette : list
            A list of colors

        Returns
        -------
        str
            A picked color for certain event
        """

        return palette[list(all_events).index(event)]

    @staticmethod
    def _round_up(
        n: float,
        dec: float,
    ) -> float:
        """
        Rounds the value up to the nearest value assuming a grid with ``dec`` step.
        E.g. ``_round_up(0.51, 0.05) = 0.55``, ``_round_up(0.55, 0.05) = 0.6``

        Parameters
        ----------
        n : float
            A number to round up
        dec : float
            A decimal for correct rounding up

        Returns
        -------
        float
            Rounded value
        """

        return round(n - n % dec + dec, 2)

    def _get_nodes_positions(self, df: pd.DataFrame) -> tuple[list[float], list[float]]:
        """
        It is a function for placing nodes at the x and y coordinates of plotly lib plot canvas.

        Parameters
        ----------
        df : pandas Dataframe
            A dataframe that contains aggregated information about the nodes.

        Returns
        -------
        tuple[list[float], list[float]]
            Two lists with the corresponding coordinates x and y.
        """
        # NOTE get x axis length
        x_len = len(df["step"].unique())

        # NOTE declare positions
        x_positions = []
        y_positions = []
        # NOTE get maximum range for placing middle points
        y_range = 0.95 - 0.05

        # NOTE going inside ranked events
        for step in sorted(df["step"].unique()):
            # NOTE placing x-axis points as well
            for _ in df[df["step"] == step][self.event_col]:
                x_positions.append([round(x, 2) for x in np.linspace(0.05, 0.95, x_len)][step - 1])

            # NOTE it always works very well if you have less than 4 values at current rank
            y_len = len(df[df["step"] == step][self.event_col])

            # NOTE at this case using came positions as x-axis because we don't need to calculate something more
            if y_len < 4:
                for p in [round(y, 2) for y in np.linspace(0.05, 0.95, y_len)]:
                    y_positions.append(p)

            # NOTE jumping in to complex part
            else:
                # NOTE total sum for understanding do we need extra step size or not
                total_sum = df[df["step"] == step]["usr_cnt"].sum()
                # NOTE step size for middle points
                y_step = round(y_range / total_sum, 2)
                # NOTE cumulative sum for understanding do we need use default step size or not
                cumulative_sum = 0
                # NOTE ENDED action
                ended_sum = df[(df["step"] == step) & (df[self.event_col] == "ENDED")]["usr_cnt"].sum()
                last_point = self._round_up(ended_sum / total_sum, 0.05)

                iterate_sum = 0

                # NOTE going deeper inside each event
                for n, event in enumerate(df[df["step"] == step][self.event_col]):
                    # NOTE placing first event at first possible position
                    if n == 0:
                        y_positions.append(0.05)

                    # NOTE placing last event at last possible position
                    elif n + 1 == y_len:
                        y_positions.append(0.95)

                    # NOTE placing middle points
                    else:
                        # NOTE we found out that 70% of total sum is the best cap for doing this case
                        if iterate_sum / total_sum > 0.2 and event != "ENDED":
                            # NOTE placing first point after the biggest one at the next position
                            # but inside [.1; .3] range
                            y_positions.append(
                                round(y_positions[-1] + np.minimum(np.maximum(y_step * iterate_sum, 0.1), 0.3), 2)
                            )

                        # NOTE placing points after the biggest
                        else:
                            # NOTE placing little points at the all available space
                            y_positions.append(
                                round(y_positions[-1] + (0.95 - last_point - y_positions[-1]) / (y_len - n), 2)
                            )

                    # NOTE set sum for next step
                    iterate_sum = df[(df["step"] == step) & (df[self.event_col] == event)]["usr_cnt"].to_numpy()[0]
                    # NOTE update cumulative sum
                    cumulative_sum += iterate_sum

        return x_positions, y_positions

    def _pad_end_events(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        If the number of events in a user's path is less than self.max_steps, then the function pads the path with
        multiply ENDED events. It is required for correct visualization of the trajectories which are
        shorter than self.max_steps.
        """
        pad = (
            data.groupby(self.user_col, as_index=False)[self.event_col]
            .count()
            .loc[lambda df_: df_[self.event_col] < self.max_steps]  # type: ignore
            .assign(repeat_number=lambda df_: self.max_steps - df_[self.event_col])
        )
        repeats = pd.DataFrame({self.user_col: np.repeat(pad[self.user_col], pad["repeat_number"])})
        padded_end_events = pd.merge(repeats, data[data[self.event_col] == "ENDED"], on=self.user_col)
        result = pd.concat([data, padded_end_events]).sort_values([self.user_col, self.event_index_col])
        return result

    def _prepare_data(self, data: pd.DataFrame) -> pd.DataFrame:
        data = self._add_ended_events(
            data=data, schema=self.__eventstream.schema, weight_col=self.__eventstream.schema.user_id
        )
        data = self._pad_end_events(data)
        # NOTE set new columns using declared functions
        data[self.time_col] = pd.to_datetime(data[self.time_col])
        data["step"] = data.groupby(self.user_col)[self.event_index_col].rank(method="first").astype(int)
        data = data.sort_values(by=["step", self.time_col]).reset_index(drop=True)
        data = self._get_next_event_and_timedelta(data)

        # NOTE threshold
        data["event_users"] = data.groupby(by=["step", self.event_col])[self.user_col].transform("nunique")
        data["total_users"] = data.loc[data["step"] == 1, self.user_col].nunique()
        data["perc"] = data["event_users"] / data["total_users"]

        if isinstance(self.threshold, float):
            column_to_compare = "perc"
        else:
            # assume that self.threshold must be of int type here
            column_to_compare = "event_users"

        events_to_keep = ["ENDED"]
        if self.targets is not None:
            events_to_keep += self.targets

        threshold_events = (
            data.loc[data["step"] <= self.max_steps, :]
            .groupby(by=self.event_col, as_index=False)[column_to_compare]
            .max()
            .loc[
                lambda df_: (df_[column_to_compare] <= self.threshold) & (~df_[self.event_col].isin(events_to_keep))
            ]  # type: ignore
            .loc[:, self.event_col]
        )
        data.loc[data[self.event_col].isin(threshold_events), self.event_col] = f"thresholded_{len(threshold_events)}"

        # NOTE rearrange the data taking into account recently added thresholded events
        data["step"] = data.groupby(self.user_col)[self.event_index_col].rank(method="first").astype(int)
        data = self._get_next_event_and_timedelta(data)

        # NOTE use max_steps for filtering data
        data = data.loc[data["step"] <= self.max_steps, :]

        # TODO: Do we really need to replace NA values?
        # NOTE skip mean calculating error
        data["time_to_next"] = data["time_to_next"].fillna(data["time_to_next"].min())
        return data

    def _render_plot(
        self,
        data_for_plot: dict,
        data_grp_nodes: pd.DataFrame,
        autosize: bool = True,
        width: int | None = None,
        height: int | None = None,
    ) -> go.Figure:
        # NOTE fill lists for plot
        targets = []
        sources = []
        values = []
        time_to_next = []
        for source_key in data_for_plot["links_dict"].keys():
            for target_key, target_value in data_for_plot["links_dict"][source_key].items():
                sources.append(source_key)
                targets.append(target_key)
                values.append(target_value["unique_users"])
                time_to_next.append(
                    str(pd.to_timedelta(target_value["avg_time_to_next"] / target_value["unique_users"])).split(".")[0]
                )
        # NOTE fill another lists for plot
        labels = []
        colors = []
        percs = []
        for key in data_for_plot["nodes_dict"].keys():
            labels += list(data_for_plot["nodes_dict"][key]["sources"])
            colors += list(data_for_plot["nodes_dict"][key]["color"])
            percs += list(data_for_plot["nodes_dict"][key]["percs"])
        # NOTE get colors for plot
        for idx, color in enumerate(colors):
            colors[idx] = "rgb" + str(color) + ""
        # NOTE get positions for plot
        x, y = self._get_nodes_positions(df=data_grp_nodes)
        # NOTE make plot
        fig = go.Figure(
            data=[
                go.Sankey(
                    arrangement="snap",
                    node=dict(
                        thickness=15,
                        line=dict(color="black", width=0.5),
                        label=labels,
                        color=colors,
                        customdata=percs,
                        hovertemplate="Total unique users: %{value} (%{customdata}% of total)<extra></extra>",
                        x=x,
                        y=y,
                        pad=20,
                    ),
                    link=dict(
                        source=sources,
                        target=targets,
                        value=values,
                        label=time_to_next,
                        hovertemplate="%{value} unique users went from %{source.label} to %{target.label}.<br />"
                        + "<br />It took them %{label} in average.<extra></extra>",
                    ),
                )
            ]
        )
        fig.update_layout(font=dict(size=15), plot_bgcolor="white", autosize=autosize, width=width, height=height)

        return fig

    def _get_links(
        self, data: pd.DataFrame, data_for_plot: dict, data_grp_nodes: pd.DataFrame
    ) -> tuple[dict, pd.DataFrame]:
        # NOTE create links aggregated dataframe
        data_grp_links = (
            data[data["step"] <= self.max_steps - 1]
            .groupby(by=["step", self.event_col, "next_event"])[[self.user_col, "time_to_next"]]
            .agg({self.user_col: ["count"], "time_to_next": ["sum"]})
            .reset_index()
            .rename(columns={self.user_col: "usr_cnt", "time_to_next": "time_to_next_sum"})
        )
        data_grp_links.columns = data_grp_links.columns.droplevel(1)
        data_grp_links = data_grp_links.merge(
            data_grp_nodes[["step", self.event_col, "index"]],
            how="inner",
            on=["step", self.event_col],
        )
        data_grp_links.loc[:, "next_step"] = data_grp_links["step"] + 1
        data_grp_links = data_grp_links.merge(
            data_grp_nodes[["step", self.event_col, "index"]].rename(
                columns={"step": "next_step", self.event_col: "next_event", "index": "next_index"}
            ),
            how="inner",
            on=["next_step", "next_event"],
        )

        data_grp_links.sort_values(by=["index", "usr_cnt"], ascending=[True, False], inplace=True)
        data_grp_links.reset_index(drop=True, inplace=True)

        # NOTE generating links plot dict
        data_for_plot.update({"links_dict": dict()})
        for index in data_grp_links["index"].unique():
            for next_index in data_grp_links[data_grp_links["index"] == index]["next_index"].unique():
                _unique_users, _avg_time_to_next = (
                    data_grp_links.loc[
                        (data_grp_links["index"] == index) & (data_grp_links["next_index"] == next_index),
                        ["usr_cnt", "time_to_next_sum"],
                    ]
                    .to_numpy()
                    .T
                )

                if index in data_for_plot["links_dict"]:
                    if next_index in data_for_plot["links_dict"][index]:
                        data_for_plot["links_dict"][index][next_index]["unique_users"] = _unique_users[0]
                        data_for_plot["links_dict"][index][next_index]["avg_time_to_next"] = np.timedelta64(
                            _avg_time_to_next[0]
                        )
                    else:
                        data_for_plot["links_dict"][index].update(
                            {
                                next_index: {
                                    "unique_users": _unique_users[0],
                                    "avg_time_to_next": np.timedelta64(_avg_time_to_next[0]),
                                }
                            }
                        )
                else:
                    data_for_plot["links_dict"].update(
                        {
                            index: {
                                next_index: {
                                    "unique_users": _unique_users[0],
                                    "avg_time_to_next": np.timedelta64(_avg_time_to_next[0]),
                                }
                            }
                        }
                    )
        return data_for_plot, data_grp_links

    def _get_nodes(self, data: pd.DataFrame) -> tuple[dict, pd.DataFrame]:
        all_events = list(data[self.event_col].unique())
        palette = self._prepare_palette(all_events)

        # NOTE create nodes aggregate dataframe
        data_grp_nodes = (
            data.groupby(by=["step", self.event_col])[self.user_col]
            .nunique()
            .reset_index()
            .rename(columns={self.user_col: "usr_cnt"})
        )
        data_grp_nodes.loc[:, "usr_cnt_total"] = data_grp_nodes.groupby(by=["step"])["usr_cnt"].transform("sum")
        data_grp_nodes.loc[:, "perc"] = np.round(
            (data_grp_nodes.loc[:, "usr_cnt"] / data_grp_nodes.loc[:, "usr_cnt_total"]) * 100, 2
        )
        data_grp_nodes.sort_values(
            by=["step", "usr_cnt", self.event_col],
            ascending=[True, False, True],
            inplace=True,
        )
        data_grp_nodes.reset_index(
            drop=True,
            inplace=True,
        )
        data_grp_nodes.loc[:, "color"] = data_grp_nodes[self.event_col].apply(
            lambda x: self._make_color(x, all_events, palette)
        )
        data_grp_nodes.loc[:, "index"] = data_grp_nodes.index  # type: ignore
        # NOTE doing right ranking
        if self.sorting is None:
            data_grp_nodes.loc[:, "sorting"] = 100
        else:
            for n, s in enumerate(self.sorting):
                data_grp_nodes.loc[data_grp_nodes[self.event_col] == s, "sorting"] = n
            data_grp_nodes.loc[:, "sorting"].fillna(100, inplace=True)
        # NOTE placing ENDED at the end
        data_grp_nodes.loc[data_grp_nodes[self.event_col] == "ENDED", "sorting"] = 101
        # NOTE using custom ordering
        data_grp_nodes.loc[:, "sorting"] = data_grp_nodes.loc[:, "sorting"].astype(int)

        # @TODO: step variable is not used inside the loop. The loop might be invalid. Vladimir Kukushkin
        # NOTE doing loop for valid ranking
        for step in data_grp_nodes["step"].unique():
            # NOTE saving last level order
            data_grp_nodes.loc[:, "order_by"] = (
                data_grp_nodes.groupby(by=[self.event_col])["index"].transform("shift").fillna(100).astype(int)
            )

            # NOTE placing ENDED events at the end
            data_grp_nodes.loc[data_grp_nodes[self.event_col] == "ENDED", "sorting"] = 101

            # NOTE creating new indexes
            data_grp_nodes.sort_values(
                by=["step", "sorting", "order_by", "usr_cnt", self.event_col],
                ascending=[True, True, True, False, True],
                inplace=True,
            )

            data_grp_nodes.reset_index(
                drop=True,
                inplace=True,
            )

            data_grp_nodes.loc[:, "index"] = data_grp_nodes.index  # type: ignore

        # NOTE generating nodes plot dict
        data_for_plot: Dict[str, Any] = dict()
        data_for_plot.update({"nodes_dict": dict()})
        for step in data_grp_nodes["step"].unique():
            data_for_plot["nodes_dict"].update({step: dict()})
            _sources, _color, _sources_index, _percs = (
                data_grp_nodes.loc[data_grp_nodes["step"] == step, [self.event_col, "color", "index", "perc"]]
                .to_numpy()
                .T
            )

            data_for_plot["nodes_dict"][step].update(
                {
                    "sources": list(_sources),
                    "color": list(_color),
                    "sources_index": list(_sources_index),
                    "percs": list(_percs),
                }
            )

        return data_for_plot, data_grp_nodes

    @staticmethod
    def _prepare_palette(all_events: list) -> list[tuple]:
        # NOTE default color palette
        palette_hex = ["50BE97", "E4655C", "FCC865", "BFD6DE", "3E5066", "353A3E", "E6E6E6"]
        # NOTE convert HEX to RGB
        palette = []
        for color in palette_hex:
            rgb_color = tuple(int(color[i : i + 2], 16) for i in (0, 2, 4))
            palette.append(rgb_color)

        # NOTE extend color palette if number of events more than default colors list
        complementary_palette = sns.color_palette("deep", len(all_events) - len(palette))
        if len(complementary_palette) > 0:
            colors = complementary_palette.as_hex()
            for c in colors:
                col = c[1:]
                palette.append(tuple(int(col[i : i + 2], 16) for i in (0, 2, 4)))

        return palette

    def _get_next_event_and_timedelta(self, data: pd.DataFrame) -> pd.DataFrame:
        grouped = data.groupby(self.user_col)
        data["next_event"] = grouped[self.event_col].shift(-1)
        data["next_timestamp"] = grouped[self.time_col].shift(-1)
        data["time_to_next"] = data["next_timestamp"] - data[self.time_col]
        data = data.drop("next_timestamp", axis=1)
        return data

[docs]    @time_performance(scope="step_sankey", event_name="fit")
    def fit(
        self,
        max_steps: int = 10,
        threshold: int | float = 0.05,
        sorting: list | None = None,
        targets: list[str] | str | None = None,
    ) -> None:
        """
        Calculate the sankey diagram internal values with the defined parameters.
        Applying ``fit`` method is necessary for the following usage
        of any visualization or descriptive ``StepSankey`` methods.

        Parameters
        ----------
        max_steps : int, default 10
            Maximum number of steps in trajectories to include. Should be > 1.
        threshold : float | int, default 0.05
            Used to remove rare events from the plot. An event is collapsed to ``thresholded_N`` artificial event if
            its maximum frequency across all the steps is less than or equal to ``threshold``. The frequency is set
            with respect to ``threshold`` type:

            - If ``int`` - the frequency is the number of unique users who had given event at given step.
            - If ``float`` - percentage of users: the same as for ``int``, but divided by the number of unique users.

            The events which are prohibited for collapsing could be enlisted in ``target`` parameter.
        sorting : list of str, optional
            Define the order of the events visualized at each step. The events that are not represented in the list
            will follow after the events from the list.
        targets : str or list of str, optional
            Contain events that are prohibited for collapsing with ``threshold`` parameter.

        Raises
        ------
        ValueError
            If ``max_steps`` parameter is <= 1.
        """
        data = self.__eventstream.to_dataframe(copy=True)[
            [self.user_col, self.event_col, self.time_col, self.event_index_col]
        ]
        if max_steps <= 1:
            raise ValueError("max_steps parameter must be > 1!")
        called_params = {
            "max_steps": max_steps,
            "threshold": threshold,
            "sorting": sorting,
            "targets": targets,
        }

        self.max_steps = max_steps
        self.threshold = threshold
        self.sorting = sorting
        self.targets = targets

        self.data = self._prepare_data(data)
        data_for_plot, self.data_grp_nodes = self._get_nodes(self.data)
        self.data_for_plot, self.data_grp_links = self._get_links(self.data, data_for_plot, self.data_grp_nodes)
        performance_info = {
            "nodes_count": len(self.data_grp_nodes),
            "links_count": len(self.data_grp_links),
        }
        collect_data_performance(
            scope="step_sankey",
            event_name="metadata",
            called_params=called_params,
            performance_data=performance_info,
            eventstream_index=self.__eventstream._eventstream_index,
        )

[docs]    @time_performance(
        scope="step_sankey",
        event_name="plot",
    )
    def plot(self, autosize: bool = True, width: int | None = None, height: int | None = None) -> go.Figure:
        """
        Create a Sankey interactive plot based on the calculated values.
        Should be used after :py:func:`fit`.

        Parameters
        ----------
        autosize : bool, default True
            Plotly autosize parameter. See :plotly_autosize:`plotly documentation<>`.
        width : int, optional
            Plot's width (in px). See :plotly_width:`plotly documentation<>`.
        height : int, optional
            Plot's height (in px). See :plotly_height:`plotly documentation<>`.

        Returns
        -------
        plotly.graph_objects.Figure

        """
        called_params = {
            "autosize": autosize,
            "width": width,
            "height": height,
        }

        figure = self._render_plot(
            data_for_plot=self.data_for_plot,
            data_grp_nodes=self.data_grp_nodes,
            autosize=autosize,
            width=width,
            height=height,
        )

        performance_info = {
            "nodes_count": len(self.data_grp_nodes),
            "links_count": len(self.data_grp_links),
        }
        collect_data_performance(
            scope="step_sankey",
            event_name="metadata",
            called_params=called_params,
            performance_data=performance_info,
            eventstream_index=self.__eventstream._eventstream_index,
        )

        return figure

    @property
    @time_performance(
        scope="step_sankey",
        event_name="values",
    )
    def values(self) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Returns two pd.DataFrames which the Sankey diagram is based on.

        Should be used after :py:func:`fit`.

        Returns
        -------
        tuple[pd.DataFrame, pd.DataFrame]
            1. Contains the nodes of the diagram.
            2. Contains the edges of the diagram.

        """
        return self.data_grp_nodes, self.data_grp_links

    @property
    @time_performance(
        scope="step_sankey",
        event_name="params",
    )
    def params(self) -> dict:
        """
        Returns the parameters used for the last fitting.
        Should be used after :py:func:`fit`.

        """
        return {
            "max_steps": self.max_steps,
            "threshold": self.threshold,
            "sorting": self.sorting,
            "targets": self.targets,
        }