In [1]:
import numpy as np
import pandas as pd

In [2]:
!pip install retentioneering

In [3]:
from retentioneering import datasets
from retentioneering.eventstream import Eventstream
stream = datasets.load_simple_shop()

## Basic example

In [4]:
stream.sequences()

Unnamed: 0_level_0,user_id,user_id_share,count,count_share,avg_count,sequence_type,user_id_sample
Sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
path_end,3 751,1.0,3 751,0.09,1.0,other,[572097469]
path_start,3 751,1.0,3 751,0.09,1.0,other,[572097469]
catalog,3 611,0.96,14 518,0.36,4.02,other,[962547996]
main,2 385,0.64,5 635,0.14,2.36,other,[670686056]
cart,1 924,0.51,2 842,0.07,1.48,other,[656119954]
product2,1 430,0.38,2 172,0.05,1.52,other,[90644948]
delivery_choice,1 356,0.36,1 686,0.04,1.24,other,[46668818]
product1,1 122,0.3,1 515,0.04,1.35,other,[858537514]
payment_choice,958,0.26,1 107,0.03,1.16,other,[859867220]
delivery_courier,748,0.2,834,0.02,1.11,other,[768244430]


<retentioneering.tooling.sequences.sequences.Sequences at 0x153c2b950>

## Tuning the arguments

In [5]:
stream\
    .split_sessions(timeout=(30, 'm'))\
    .sequences(
        ngram_range=(2, 3),
        weight_col='session_id',
        metrics=['count', 'count_share', 'paths_share'],
        threshold=['count', 1200],
        sorting=['count_share', False],
        heatmap_cols=['session_id_share'],
        sample_size=3
    )

Unnamed: 0_level_0,count,count_share,session_id_share,sequence_type,session_id_sample
Sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
catalog->catalog,4 857,0.06,0.4,loop,"['122145952_1', '168248042_1', '872837948_1']"
main->catalog,4 064,0.05,0.51,other,"['462395474_2', '736698499_2', '819489198_1']"
session_start->main,3 768,0.04,0.58,other,"['950168808_1', '816437104_1', '154140118_5']"
path_start->session_start,3 751,0.04,0.58,other,"['607824880_1', '343679414_1', '891177460_1']"
session_end->path_end,3 751,0.04,0.58,other,"['607824880_1', '343679414_2', '891177460_1']"
catalog->session_end,2 852,0.03,0.44,other,"['598153538_1', '190589169_1', '57179757_1']"
path_start->session_start->catalog,2 686,0.03,0.42,other,"['316559524_1', '380327213_1', '226199400_1']"
session_start->catalog,2 686,0.03,0.42,other,"['316559524_1', '380327213_1', '226199400_1']"
session_start->main->catalog,2 619,0.03,0.41,other,"['873271748_2', '15921702_2', '550059631_3']"
catalog->product2,2 172,0.03,0.27,other,"['422594385_1', '895368058_1', '166354605_1']"


<retentioneering.tooling.sequences.sequences.Sequences at 0x1507c7050>

## Comparing groups

In [6]:
np.random.seed(111)
users = set(stream.to_dataframe()['user_id'])
group1 = set(np.random.choice(list(users), size=len(users)//2))
group2 = users - group1

In [7]:
stream.sequences(
    groups=[group1), group2],
    group_names=['A', 'B'],
    metrics=['paths_share', 'count_share'],
    threshold=[('user_id_share', 'delta_abs'), 0],
    sorting=[('count_share', 'delta_rel'), False]
)

Unnamed: 0_level_0,user_id_share,user_id_share,user_id_share,user_id_share,count_share,count_share,count_share,count_share,sequence_type,user_id_sample,user_id_sample
Unnamed: 0_level_1,A,B,delta_abs,delta_rel,A,B,delta_abs,delta_rel,Unnamed: 9_level_1,A,B
Sequence,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
product1,0.31,0.29,0.02,0.08,0.04,0.04,0.0,0.08,other,[138113721],[953962586]
main,0.64,0.63,0.01,0.02,0.15,0.14,0.01,0.05,other,[221205779],[263197489]
path_end,1.0,1.0,0.0,0.0,0.09,0.09,-0.0,-0.0,other,[95158569],[91859222]
path_start,1.0,1.0,0.0,0.0,0.09,0.09,-0.0,-0.0,other,[95158569],[91859222]
payment_cash,0.05,0.05,0.0,0.02,0.0,0.01,-0.0,-0.04,other,[68568059],[738009902]


<retentioneering.tooling.sequences.sequences.Sequences at 0x153e209d0>