Source code for retentioneering.tooling.funnel.funnel

from __future__ import annotations

from collections.abc import Collection
from typing import Any, Literal

import pandas as pd
import plotly.graph_objects as go
from pandas.core.common import flatten

from retentioneering.backend.tracker import track
from retentioneering.eventstream.types import EventstreamType

FunnelTypes = Literal["open", "closed", "hybrid"]


[docs]class Funnel: """ A class for the calculation and visualization of a conversion funnel. Parameters ---------- eventstream : EventstreamType See Also -------- .Eventstream.funnel : Call Funnel tool as an eventstream method. Notes ----- See :doc:`Funnel user guide</user_guides/funnel>` for the details. """ __default_layout = dict( margin={"l": 180, "r": 0, "t": 30, "b": 0, "pad": 0}, funnelmode="stack", showlegend=True, hovermode="closest", legend=dict(orientation="v", bgcolor="#E2E2E2", xanchor="left", font=dict(size=12)), ) __eventstream: EventstreamType stages: list[str] stage_names: list[str] | None funnel_type: FunnelTypes segments: Collection[Collection[int]] | None segment_names: list[str] | None __res_dict: dict[str, dict] @track( # type: ignore tracking_info={"event_name": "init"}, scope="funnel", allowed_params=[], ) def __init__(self, eventstream: EventstreamType) -> None: self.__eventstream = eventstream self.user_col = self.__eventstream.schema.user_id self.event_col = self.__eventstream.schema.event_name self.time_col = self.__eventstream.schema.event_timestamp self.__res_dict = {} def __validate_input( self, stages: list[str], stage_names: list[str] | None = None, funnel_type: FunnelTypes = "closed", segments: Collection[Collection[int]] | None = None, segment_names: list[str] | None = None, ) -> tuple[pd.DataFrame, list[str], list[str], FunnelTypes, Collection[Collection[int]], list[str]]: data = self.__eventstream.to_dataframe(copy=True) data = data[data[self.event_col].isin([i for i in flatten(stages)])] # type: ignore if stages and stage_names and len(stages) != len(stage_names): raise ValueError("stages and stage_names must be the same length!") if segments is None: segments = [data[self.user_col].unique().tolist()] segment_names = ["all users"] else: sets = [set(segment) for segment in segments] if len(set.intersection(*sets)) > 0: raise ValueError("Check intersections of users in segments!") if segment_names is None: segment_names = [f"group {i}" for i in range(len(segments))] # type: ignore if segments and segment_names and len(segments) != len(segment_names): # type: ignore raise ValueError("segments and segment_names must be the same length!") # IDK why but pyright thinks this is Funnel!!! if funnel_type not in ["open", "closed", "hybrid"]: raise ValueError("funnel_type should be 'open', 'closed' or 'hybrid'!") for idx, stage in enumerate(stages): if type(stage) is not list: stages[idx] = [stage] # type: ignore if stage_names is None: stage_names = [] for t in stages: # get name stage_names.append(" | ".join(t).strip(" | ")) stage_names = stage_names return data, stages, stage_names, funnel_type, segments, segment_names def _plot_stacked_funnel(self, data: list[go.Funnel]) -> go.Figure: layout = go.Layout(**self.__default_layout) fig = go.Figure(data, layout) return fig @staticmethod def _calculate_plot_data(plot_params: dict[str, Any]) -> list[go.Funnel]: data = [] for t in plot_params.keys(): trace = go.Funnel( name=t, y=plot_params[t]["stages"], x=plot_params[t]["values"], textinfo="value+percent initial+percent previous", ) data.append(trace) return data def _prepare_data_for_closed_and_hybrid_funnel( self, data: pd.DataFrame, stages: list[str], stage_names: list[str], segments: Collection[Collection[int]], segment_names: list[str], ) -> dict[str, dict]: min_time_0stage = ( data[data[self.event_col].isin(stages[0])].groupby(self.user_col)[[self.time_col]].min().reset_index() ) data = data.merge(min_time_0stage, "left", on=self.user_col, suffixes=("", "_min")) data.rename(columns={data.columns[-1]: "min_date"}, inplace=True) # filtered NA and only events that occurred after the user entered the first funnel event remain data = data[(~data["min_date"].isna()) & (data["min_date"] <= data[self.time_col])] data.drop(columns="min_date", inplace=True) __res_dict = {} for segment, name in zip(segments, segment_names): vals, _df = self._crop_df(data, stages, segment) __res_dict[name] = {"stages": stage_names, "values": vals} return __res_dict def _prepare_data_for_open_funnel( self, data: pd.DataFrame, stages: list[str], stage_names: list[str], segments: Collection[Collection[int]], segment_names: list[str], ) -> dict[str, dict]: __res_dict = {} for segment, name in zip(segments, segment_names): # isolate users from group group_data = data[data[self.user_col].isin(segment)] vals = [group_data[group_data[self.event_col].isin(stage)][self.user_col].nunique() for stage in stages] __res_dict[name] = {"stages": stage_names, "values": vals} return __res_dict def _crop_df(self, df: pd.DataFrame, stages: list[str], segment: Collection[int]) -> tuple[list[int], pd.DataFrame]: first_stage = stages[0] next_stages = stages[1:] first_stage_users = set( (df[(df[self.event_col].isin(first_stage)) & (df[self.user_col].isin(segment))][self.user_col]) ) df = df.drop( df[(~df[self.user_col].isin(first_stage_users)) | (df[self.event_col].isin(first_stage))].index.tolist() ) prev_users_stage = first_stage_users vals = [len(first_stage_users)] for stage in next_stages: user_stage = set( df[(df[self.event_col].isin(stage)) & (df[self.user_col].isin(first_stage_users))][self.user_col] ) user_stage = user_stage - (user_stage - prev_users_stage) prev_users_stage = user_stage vals.append(len(user_stage)) if self.funnel_type == "closed": stage_min_df = ( df[df[self.event_col].isin(stage)].groupby(self.user_col)[[self.time_col]].min().reset_index() ) df = df.merge(stage_min_df, "left", on=self.user_col, suffixes=("", "_min")) df.rename(columns={df.columns[-1]: "min_date"}, inplace=True) df.drop( df[ (df["min_date"].isna()) | (df["min_date"] >= df[self.time_col]) | (~df[self.user_col].isin(user_stage)) ].index.tolist(), inplace=True, ) df.drop(columns="min_date", inplace=True) else: df = df.drop(df[~df[self.user_col].isin(user_stage)].index.tolist()) return vals, df
[docs] @track( # type: ignore tracking_info={"event_name": "fit"}, scope="funnel", allowed_params=[ "stages", "stage_names", "funnel_type", "segments", "segment_names", ], ) def fit( self, stages: list[str], stage_names: list[str] | None = None, funnel_type: FunnelTypes = "closed", segments: Collection[Collection[int]] | None = None, segment_names: list[str] | None = None, ) -> None: """ Calculate the funnel internal values with the defined parameters. Applying ``fit`` method is necessary for the following usage of any visualization or descriptive ``Funnel`` methods. Parameters ---------- stages : list of str List of events used as stages for the funnel. Absolute and relative number of users who reached specified events at least once will be plotted. Multiple events can be grouped together as an individual state by combining them as a sub list. stage_names : list of str, optional List of stage names, this is necessary for stages that include several events. funnel_type : 'open', 'closed' or 'hybrid', default 'closed' - if ``open`` - all users will be counted on each stage; - if ``closed`` - each stage will include only users, that were present on all previous stages; - if ``hybrid`` - combination of 2 previous types. The first stage is required to go further. And for the second and subsequent stages it is important to have all previous stages in their path, but the order of these events is not taken into account. segments : Collection[Collection[int]], optional List of user_ids collections. Funnel for each user_id collection will be plotted. If ``None`` - all users from the dataset will be plotted. A user can only belong to one segment at a time. segment_names : list of str, optional Names of segments. Should be a list from unique values of the ``segment_col``. If ``None`` and ``segment_col`` is given - all values from ``segment_col`` will be used. """ ( data, self.stages, self.stage_names, self.funnel_type, self.segments, self.segment_names, ) = self.__validate_input(stages, stage_names, funnel_type, segments, segment_names) if self.funnel_type in ["closed", "hybrid"]: self.__res_dict = self._prepare_data_for_closed_and_hybrid_funnel( data=data, stages=self.stages, stage_names=self.stage_names, segments=self.segments, segment_names=self.segment_names, ) elif self.funnel_type == "open": self.__res_dict = self._prepare_data_for_open_funnel( data=data, stages=self.stages, segments=self.segments, segment_names=self.segment_names, stage_names=self.stage_names, )
[docs] @track( # type: ignore tracking_info={"event_name": "plot"}, scope="funnel", allowed_params=[], ) def plot(self) -> go.Figure: """ Create a funnel plot based on the calculated funnel values. Should be used after :py:func:`fit`. Returns ------- go.Figure """ result_dict = self.__res_dict data = self._calculate_plot_data(plot_params=result_dict) figure = self._plot_stacked_funnel(data=data) return figure
@property @track( # type: ignore tracking_info={"event_name": "values"}, scope="funnel", allowed_params=[], ) def values(self) -> pd.DataFrame: """ Returns a pd.DataFrame representing the calculated funnel values. Should be used after :py:func:`fit`. Returns ------- pd.DataFrame +------------------+-------------+-----------------+-------------------+-----------------+ | **segment_name** | **stages** | **unique_users**| **%_of_initial** | **%_of_total** | +------------------+-------------+-----------------+-------------------+-----------------+ | segment_1 | stage_1 | 2000 | 100.00 | 100.00 | +------------------+-------------+-----------------+-------------------+-----------------+ """ result_dict = self.__res_dict result_list = [] for key in result_dict: result_ = pd.DataFrame(result_dict[key]) result_.columns = ["stages", "unique_users"] # type: ignore result_["segment_name"] = key result_ = result_[["segment_name", "stages", "unique_users"]] result_["shift"] = result_["unique_users"].shift(periods=1, fill_value=result_["unique_users"][0]) result_["%_of_initial"] = (result_["unique_users"] / result_["shift"] * 100).round(2) result_["%_of_total"] = (result_["unique_users"] / result_["unique_users"][0] * 100).round(2) result_.drop(columns="shift", inplace=True) result_list.append(result_) result_df = pd.concat(result_list).set_index(["segment_name", "stages"]) return result_df @property @track( # type: ignore tracking_info={"event_name": "params"}, scope="funnel", allowed_params=[], ) def params(self) -> dict: """ Returns the parameters used for the last fitting. """ return { "stages": self.stages, "stage_names": self.stage_names, "funnel_type": self.funnel_type, "segments": self.segments, "segment_names": self.segment_names, }