Source code for retentioneering.core.core_functions.step_matrix

# * Copyright (C) 2020 Maxim Godzi, Anatoly Zaytsev, Retentioneering Team
# * This Source Code Form is subject to the terms of the Retentioneering Software Non-Exclusive License (License)
# * By using, sharing or editing this code you agree with the License terms and conditions.
# * You can obtain License text at https://github.com/retentioneering/retentioneering-tools/blob/master/LICENSE.md


import pandas as pd

from retentioneering.visualization import plot_step_matrix


def step_matrix(self, *, max_steps=20, weight_col=None, precision=2, targets=None, accumulated=None,
                sorting=None, thresh=0, centered=None, groups=None, show_plot=True):
    """
    Plots heatmap with distribution of users over trajectory steps ordered by event name.

    Matrix rows are event names, columns are aligned user trajectory step numbers and the
    values are shares of users. A given entry X at column i and event j means that at the
    i'th step a fraction X of users have event j.

    Parameters
    ----------
    max_steps: int (optional, default 20)
        Maximum number of steps in trajectory to include.
    weight_col: str (optional, default None)
        Aggregation column for edge weighting. If None, the 'user_col' entry of
        ``self.retention_config`` is used as column name. For example, can be specified
        as `session_id` if dataframe has such column.
    precision: int (optional, default 2)
        Number of decimal digits after 0 to show as fractions in the heatmap.
    thresh: float (optional, default 0)
        Used to remove rare events. Aggregates all rows where all absolute values are
        less than the specified threshold into a single 'THRESHOLDED_<n>' row.
    targets: list (optional, default None)
        List of event names (as str) to include in the bottom of step_matrix as individual
        rows. Each specified target will have separate color-coding space for clear
        visualization. Example: ['product_page', 'cart', 'payment'].
        If multiple targets need to be compared and plotted using the same color-coding
        scale, such targets must be combined in a sub-list.
        Example: ['product_page', ['cart', 'payment']]
    accumulated: string (optional, default None)
        Option to include accumulated values for targets. Valid values are None (do not
        show accumulated targets), 'both' (show step values and accumulated values),
        'only' (show targets only as accumulated).
    centered: dict (optional, default None)
        Parameter used to align user trajectories at specific event at specific step.
        Has to contain three keys:
            'event': str, name of event to align
            'left_gap': int, number of events to include before specified event
            'occurrence': int, which occurrence of event to align (typically 1)
        When this parameter is not None only users which have the specified i'th
        'occurrence' of the selected event present in their trajectories will be included.
        Fraction of such remaining users is specified in the title of centered step_matrix.
        Example: {'event': 'cart', 'left_gap': 8, 'occurrence': 1}
    sorting: list (optional, default None)
        List of event names (as str) can be passed to plot step_matrix with specified
        ordering of events. If None, rows will be ordered according to the i'th value
        (first row is where the 1st element is max, second row is where the second
        element is max, etc).
    groups: tuple (optional, default None)
        Can be specified to plot differential step_matrix. Must contain a tuple of two
        elements (g_1, g_2), where g_1 and g_2 are collections of user_id's (list, tuple
        or set). Two separate step matrices M1 and M2 will be calculated for users from
        g_1 and g_2, respectively. Resulting matrix will be M = M1 - M2. Note that values
        in each column of a differential step matrix sum up to 0 (since columns in both
        M1 and M2 always sum up to 1).
    show_plot: bool (optional, default True)
        Whether to show resulting heatmap or not.

    Returns
    -------
    pd.DataFrame
        Dataframe with max_steps columns and at most one row per unique event
        (fewer if thresh > 0 collapses rare events).

    Raises
    ------
    IndexError
        If `groups` is given and one of the two groups has no users in the dataset.
    TypeError
        If `sorting` is provided but is not a list.
    ValueError
        If `sorting` does not contain exactly the events present in the matrix.
    """
    event_col = self.retention_config['event_col']
    weight_col = weight_col or self.retention_config['user_col']
    time_col = self.retention_config['event_time_col']

    data = self._obj.copy()

    # add termination event to each history:
    data = data.rete.split_sessions(thresh=None, eos_event='ENDED', session_col=None)

    # 1-based position of every event inside its user's (or session's) trajectory
    data['event_rank'] = data.groupby(weight_col).cumcount() + 1

    values_kwargs = dict(weight_col=weight_col,
                         event_col=event_col,
                         time_col=time_col,
                         targets=targets,
                         accumulated=accumulated,
                         centered=centered,
                         max_steps=max_steps)

    # BY HERE WE NEED TO OBTAIN FINAL DIFF piv and piv_targets
    # before sorting, thresholding and plotting:
    if groups:
        data_pos = data[data[weight_col].isin(groups[0])].copy()
        if data_pos.empty:
            raise IndexError('Users from positive group are not present in dataset')
        piv_pos, piv_targets_pos, fraction_title, window, targets_plot = \
            _step_matrix_values(data=data_pos, **values_kwargs)

        data_neg = data[data[weight_col].isin(groups[1])].copy()
        # the negative group itself must be checked here (not the positive one again)
        if data_neg.empty:
            raise IndexError('Users from negative group are not present in dataset')
        piv_neg, piv_targets_neg, fraction_title, window, targets_plot = \
            _step_matrix_values(data=data_neg, **values_kwargs)

        # events present in only one of the groups count as 0 in the other one
        piv = piv_pos.sub(piv_neg, fill_value=0)
        if targets:
            piv_targets = piv_targets_pos.sub(piv_targets_neg, fill_value=0)
        else:
            piv_targets = None
    else:
        piv, piv_targets, fraction_title, window, targets_plot = \
            _step_matrix_values(data=data, **values_kwargs)

    thresh_index = 'THRESHOLDED_'
    if thresh != 0:
        # find if there are any rows to threshold
        # (axis is passed by keyword: it is keyword-only in recent pandas):
        rare_rows = (piv.abs() < thresh).all(axis=1)
        if rare_rows.any():
            thresholded = piv.loc[rare_rows]
            piv = piv.loc[~rare_rows].copy()
            thresh_index = f'THRESHOLDED_{len(thresholded)}'
            piv.loc[thresh_index] = thresholded.sum()

    if sorting is None:
        piv = _sort_matrix(piv)

        # service rows are always pinned to the bottom of the matrix
        keep_in_the_end = [row for row in ('ENDED', thresh_index) if row in piv.index]
        events_order = [*(i for i in piv.index if i not in keep_in_the_end),
                        *keep_in_the_end]
        piv = piv.loc[events_order]

        # add sorting list to config
        self.retention_config['step_matrix'] = {'sorting': events_order}
    else:
        # if custom sorting was provided:
        if not isinstance(sorting, list):
            raise TypeError('parameter `sorting` must be a list of event names')
        if {*sorting} != {*piv.index}:
            raise ValueError(
                'provided sorting list does not match list of events. Run with `sorting` = None to get the actual list')
        piv = piv.loc[sorting]

    if centered:
        # relabel steps so that the aligned event sits at column '0'
        piv.columns = [f'{int(i) - window - 1}' for i in piv.columns]
        if targets:
            piv_targets.columns = [f'{int(i) - window - 1}' for i in piv_targets.columns]

    if show_plot:
        plot_step_matrix.step_matrix(piv, piv_targets,
                                     targets_list=targets_plot,
                                     title=f'{"centered" if centered else ""} {"differential " if groups else ""}step matrix {fraction_title}',
                                     centered_position=window,
                                     precision=precision)

    return piv
def _step_matrix_values(*, data, weight_col, event_col, time_col, targets, accumulated, centered, max_steps): """ Supplemental function to calculate values for step_matrix Parameters same as in step_matrix Parameters ---------- data weight_col event_col time_col targets accumulated centered max_steps Returns ------- pandas Dataframe """ def pad_cols(df, max_steps): """ Parameters ---------- df - dataframe max_steps - number of cols Returns ------- returns Dataframe with columns from 0 to max_steps """ df = df.copy() if max(df.columns) < max_steps: for col in range(max(df.columns) + 1, max_steps + 1): df[col] = 0 # add missing cols if needed: if min(df.columns) > 1: for col in range(1, min(df.columns)): df[col] = 0 # sort cols return df[list(range(1, max_steps + 1))] from copy import deepcopy data = data.copy() targets = deepcopy(targets) # ALIGN DATA IF CENTRAL fraction_title = '' window = None if centered is not None: # CHECKS if (not isinstance(centered, dict) or ({'event', 'left_gap', 'occurrence'} - {*centered.keys()})): raise ValueError("Parameter centered must be dict with following keys: 'event', 'left_gap', 'occurrence'") center_event = centered.get('event') window = centered.get('left_gap') occurrence = centered.get('occurrence') if occurrence < 1 or not isinstance(occurrence, int): raise ValueError("key value 'occurrence' must be Int >=1") if window < 0 or not isinstance(window, int): raise ValueError("key value 'left_gap' must be Int >=0") if center_event not in data[event_col].unique(): raise ValueError(f'Event "{center_event}" not found in the column: "{event_col}"') # keep only users who have center_event at least N = occurrence times data['occurrence'] = data[event_col] == center_event data['occurance_counter'] = data.groupby(weight_col)['occurrence'].cumsum() * data['occurrence'] users_to_keep = data[data['occurance_counter'] == occurrence][weight_col].unique() if len(users_to_keep) == 0: raise ValueError(f'no records found with event 
"{center_event}" occuring N={occurrence} times') fraction_used = len(users_to_keep) / data[weight_col].nunique() * 100 if fraction_used < 100: fraction_title = f'({fraction_used:.1f}% of total records)' data = data[data[weight_col].isin(users_to_keep)].copy() def pad_to_center(x): position = x.loc[(x[event_col] == center_event) & (x['occurance_counter'] == occurrence)]['event_rank'].min() shift = position - window - 1 x['event_rank'] = x['event_rank'] - shift return x data = data.groupby(weight_col).apply(pad_to_center) data = data[data['event_rank'] > 0].copy() # calculate step matrix elements: agg = (data .groupby(['event_rank', event_col])[weight_col] .nunique() .reset_index()) agg[weight_col] /= data[weight_col].nunique() agg = agg[agg['event_rank'] <= max_steps] agg.columns = ['event_rank', 'event_name', 'freq'] piv = agg.pivot(index='event_name', columns='event_rank', values='freq').fillna(0) # add missing cols if number of events < max_steps: piv = pad_cols(piv, max_steps) piv.columns.name = None piv.index.name = None # MAKE TERMINATED STATE ACCUMULATED: if 'ENDED' in piv.index: piv.loc['ENDED'] = piv.loc['ENDED'].cumsum().fillna(0) # add NOT_STARTED events for centered matrix if centered: piv.loc['NOT_STARTED'] = 1 - piv.sum() # ADD ROWS FOR TARGETS: piv_targets = None if targets: # obtain flatten list of targets: targets_flatten = list(pd.core.common.flatten(targets)) # format targets to list of lists: for n, i in enumerate(targets): if type(i) != list: targets[n] = [i] agg_targets = (data .groupby(['event_rank', event_col])[time_col] .count() .reset_index()) agg_targets[time_col] /= data[weight_col].nunique() agg_targets.columns = ['event_rank', 'event_name', 'freq'] agg_targets = agg_targets[agg_targets['event_rank'] <= max_steps] piv_targets = agg_targets.pivot(index='event_name', columns='event_rank', values='freq').fillna(0) piv_targets = pad_cols(piv_targets, max_steps) # if target is not present in dataset add zeros: for i in targets_flatten: if i 
not in piv_targets.index: piv_targets.loc[i] = 0 piv_targets = piv_targets.loc[targets_flatten].copy() piv_targets.columns.name = None piv_targets.index.name = None if accumulated == 'only': piv_targets.index = map(lambda x: 'ACC_' + x, piv_targets.index) for i in piv_targets.index: piv_targets.loc[i] = piv_targets.loc[i].cumsum().fillna(0) # change names is targets list: for target in targets: for j, item in enumerate(target): target[j] = 'ACC_'+item if accumulated == 'both': for i in piv_targets.index: piv_targets.loc['ACC_'+i] = piv_targets.loc[i].cumsum().fillna(0) # add accumulated targets to the list: targets_not_acc = deepcopy(targets) for target in targets: for j, item in enumerate(target): target[j] = 'ACC_'+item targets = targets_not_acc + targets return piv, piv_targets, fraction_title, window, targets def _sort_matrix(step_matrix): x = step_matrix.copy() order = [] for i in x.columns: new_r = x[i].idxmax() order.append(new_r) x = x.drop(new_r) if x.shape[0] == 0: break order.extend(list(set(step_matrix.index) - set(order))) return step_matrix.loc[order]