## Prerequisites

Run this cell to prepare the environment. This step is obligatory.

In [1]:
!pip install retentioneering

# Preprocessing

The full text of the [Preprocessing graph](https://doc.retentioneering.com/stable/doc/user_guides/preprocessing.html) user guide is available on the retentioneering website.

In [2]:
import pandas as pd
from retentioneering import datasets

# Load the simple_shop sample dataset; `stream` is the input of the examples below.
stream = datasets.load_simple_shop()

## Preprocessing GUI

In [10]:
# Preprocessing graph built from `stream`; the GUI steps of this section operate on `pgraph`.
pgraph = stream.preprocessing_graph()

In [11]:
def group_products(df, schema):
    """Flag the rows whose event name is one of the product pages.

    Returns a boolean Series aligned with ``df`` that is True where the
    ``schema.event_name`` column equals 'product1' or 'product2'.
    """
    product_pages = ['product1', 'product2']
    event_names = df[schema.event_name]
    return event_names.isin(product_pages)

In [12]:
def get_new_users(df, schema):
    """Build a mask selecting every event of the users labelled as new.

    A user qualifies when at least one of their rows carries the
    'new_user' event; the returned boolean Series is True for all rows
    that belong to such users.
    """
    has_marker = df[schema.event_name] == 'new_user'
    marked_ids = df.loc[has_marker, schema.user_id].unique()
    return df[schema.user_id].isin(marked_ids)

def get_existing_users(df, schema):
    """Build a mask selecting every event of the users labelled as existing.

    A user qualifies when at least one of their rows carries the
    'existing_user' event; the returned boolean Series is True for all
    rows that belong to such users.
    """
    has_marker = df[schema.event_name] == 'existing_user'
    marked_ids = df.loc[has_marker, schema.user_id].unique()
    return df[schema.user_id].isin(marked_ids)

In [13]:
def remove_truncated_paths(df, schema):
    """Build a mask that excludes every user having a truncated path.

    Users with at least one 'cropped_left' or 'cropped_right' event are
    excluded entirely: the returned boolean Series is False for all of
    their rows and True for the rows of everyone else.
    """
    crop_markers = ['cropped_left', 'cropped_right']
    is_cropped = df[schema.event_name].isin(crop_markers)
    cropped_ids = df.loc[is_cropped, schema.user_id].unique()
    return ~df[schema.user_id].isin(cropped_ids)

In [14]:
# press 'Combine'/'Save & Combine' button in GUI before running this cell
# NOTE(review): `combine_result` is presumably unset until a combine has been run - confirm
pgraph.combine_result.to_dataframe()

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id,session_id
0,ca9cc3ed-5085-4a60-bb8e-038740cb8d6e,path_start,0,path_start,2019-11-01 17:59:13.273932,219483890.0,219483890.0_1
1,f4b7bcee-65f1-4cbf-b658-179a2e37266e,existing_user,0,existing_user,2019-11-01 17:59:13.273932,219483890.0,219483890.0_1
2,da1231c6-e117-4e43-9e21-0af016cd1fbe,session_start,0,session_start,2019-11-01 17:59:13.273932,219483890.0,219483890.0_1
3,bb732172-8b25-4204-94e7-af80418abca5,raw,0,catalog,2019-11-01 17:59:13.273932,219483890.0,219483890.0_1
4,dd4b2d2c-816d-4a08-a555-11b6a7cb6f12,group_alias,1,product,2019-11-01 17:59:28.459271,219483890.0,219483890.0_1
...,...,...,...,...,...,...,...
56402,18992bc9-a756-4457-b4d9-fba17840b99b,raw,32280,catalog,2020-04-29 12:48:01.809577,501098384.0,501098384.0_6
56403,71bcf87f-f69c-421c-92a2-0b24e4ed9268,raw,32281,main,2020-04-29 12:48:01.938488,501098384.0,501098384.0_6
56404,2805ba82-ba52-4cce-a23b-e86f9a8d9d86,raw,32282,catalog,2020-04-29 12:48:06.595390,501098384.0,501098384.0_6
56405,403258be-3bb5-405d-bdc3-c2f69c25e56d,session_end,32282,session_end,2020-04-29 12:48:06.595390,501098384.0,501098384.0_6


## Code-generated preprocessing graph


In [12]:
from retentioneering.preprocessing_graph import PreprocessingGraph

# Start a fresh graph for `stream`; nodes are attached to it in the cells below.
# Note that this rebinds `pgraph`, replacing the graph created in the GUI section.
pgraph = PreprocessingGraph(stream)

### Creating a single node

In [13]:
from retentioneering.preprocessing_graph import EventsNode
from retentioneering.data_processors_lib import GroupEvents, GroupEventsParams

def group_products(df, schema):
    """Flag the rows whose event name is one of the product pages.

    Returns a boolean Series aligned with ``df`` that is True where the
    ``schema.event_name`` column equals 'product1' or 'product2'.
    """
    product_pages = ['product1', 'product2']
    event_names = df[schema.event_name]
    return event_names.isin(product_pages)

# Keyword arguments for GroupEventsParams: the event name to assign and
# the function that selects the rows to group.
group_events_params = {
    "event_name": "product",
    "func": group_products
}

# Step by step: params object -> data processor -> graph node.
data_processor_params = GroupEventsParams(**group_events_params)
data_processor = GroupEvents(params=data_processor_params)
node2 = EventsNode(data_processor)

In [14]:
# The same node written as one nested expression; reuses `group_events_params`.
node2 = EventsNode(GroupEvents(params=GroupEventsParams(**group_events_params)))

In [15]:
from retentioneering.data_processors_lib import AddStartEndEvents, AddStartEndEventsParams

# AddStartEndEvents is created with its default parameters (no arguments passed).
node1 = EventsNode(AddStartEndEvents(params=AddStartEndEventsParams()))

### Linking nodes

In [16]:
# Attach node1 directly to the graph root, then chain node2 after node1.
pgraph.add_node(node=node1, parents=[pgraph.root])
pgraph.add_node(node=node2, parents=[node1])

### Building the whole graph

In [17]:
from retentioneering.data_processors_lib import LabelNewUsers, LabelNewUsersParams

# The list of new users is downloaded from a Google Sheet through its
# CSV export URL, so this cell needs network access.
google_spreadsheet_id = '1iggpIT5CZcLILLZ94wCZPQv90tERwi1IB5Y1969C8zc'
link = f'https://docs.google.com/spreadsheets/u/1/d/{google_spreadsheet_id}/export?format=csv&id={google_spreadsheet_id}'
# The CSV is read without a header row; its first column becomes the list.
new_users = pd.read_csv(link, header=None)[0].tolist()
node3 = EventsNode(LabelNewUsers(params=LabelNewUsersParams(new_users_list=new_users)))
pgraph.add_node(node=node3, parents=[node2])

In [18]:
from retentioneering.data_processors_lib import FilterEvents, FilterEventsParams

def get_new_users(df, schema):
    """Build a mask selecting every event of the users labelled as new.

    A user qualifies when at least one of their rows carries the
    'new_user' event; the returned boolean Series is True for all rows
    that belong to such users.
    """
    has_marker = df[schema.event_name] == 'new_user'
    marked_ids = df.loc[has_marker, schema.user_id].unique()
    return df[schema.user_id].isin(marked_ids)

def get_existing_users(df, schema):
    """Build a mask selecting every event of the users labelled as existing.

    A user qualifies when at least one of their rows carries the
    'existing_user' event; the returned boolean Series is True for all
    rows that belong to such users.
    """
    has_marker = df[schema.event_name] == 'existing_user'
    marked_ids = df.loc[has_marker, schema.user_id].unique()
    return df[schema.user_id].isin(marked_ids)


# Two FilterEvents nodes branch from node3: node4 uses the new-users mask,
# node5 uses the existing-users mask.
node4 = EventsNode(FilterEvents(params=FilterEventsParams(func=get_new_users)))
node5 = EventsNode(FilterEvents(params=FilterEventsParams(func=get_existing_users)))
pgraph.add_node(node=node4, parents=[node3])
pgraph.add_node(node=node5, parents=[node3])

In [19]:
from retentioneering.data_processors_lib import LabelCroppedPaths, LabelCroppedPathsParams

# Both cutoffs use the same (value, unit) pair.
# NOTE(review): (1, 'h') presumably means one hour - confirm the unit codes in the LabelCroppedPaths docs.
params = {
    "left_cutoff": (1, 'h'),
    "right_cutoff": (1, 'h'),
}
node6 = EventsNode(LabelCroppedPaths(params=LabelCroppedPathsParams(**params)))
# node6 continues the existing-users branch (node5).
pgraph.add_node(node=node6, parents=[node5])

In [20]:
def remove_truncated_paths(df, schema):
    """Build a mask that excludes every user having a truncated path.

    Users with at least one 'cropped_left' or 'cropped_right' event are
    excluded entirely: the returned boolean Series is False for all of
    their rows and True for the rows of everyone else.
    """
    crop_markers = ['cropped_left', 'cropped_right']
    is_cropped = df[schema.event_name].isin(crop_markers)
    cropped_ids = df.loc[is_cropped, schema.user_id].unique()
    return ~df[schema.user_id].isin(cropped_ids)

# node7 filters with `remove_truncated_paths` and follows the node that labels cropped paths (node6).
node7 = EventsNode(FilterEvents(params=FilterEventsParams(func=remove_truncated_paths)))
pgraph.add_node(node=node7, parents=[node6])

In [21]:
from retentioneering.preprocessing_graph import MergeNode

# The merge node has two parents: the new-users branch (node4) and the
# existing-users branch after truncated paths were filtered (node7).
node8 = MergeNode()
pgraph.add_node(node=node8, parents=[node4, node7])

In [22]:
from retentioneering.data_processors_lib import SplitSessions, SplitSessionsParams

# Split the merged stream into sessions.
# NOTE(review): timeout=(30, 'm') presumably means 30 minutes - confirm the unit codes in the SplitSessions docs.
node9 = EventsNode(SplitSessions(params=SplitSessionsParams(timeout=(30, 'm'))))
pgraph.add_node(node=node9, parents=[node8])

### Running the calculation


In [23]:
# Combine the graph at its last node (node9) and preview the first rows of the result.
processed_stream = pgraph.combine(node=node9)
processed_stream.to_dataframe().head()

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id,session_id
0,5f3c4b0d-36d5-47b5-8f2d-332aa6e206d8,path_start,0,path_start,2019-11-01 17:59:13.273932,219483890.0,219483890.0_1
1,814840bf-27a3-45b4-bd18-10358a48fa1f,existing_user,0,existing_user,2019-11-01 17:59:13.273932,219483890.0,219483890.0_1
2,5190b07d-238b-4907-866b-ee7811757f7d,session_start,0,session_start,2019-11-01 17:59:13.273932,219483890.0,219483890.0_1
3,c0f910d4-c2c9-4baf-8300-19a74f7a2b0d,raw,0,catalog,2019-11-01 17:59:13.273932,219483890.0,219483890.0_1
4,7933f7d8-fca4-4ed8-b7f3-5436b13e3ca1,group_alias,1,product,2019-11-01 17:59:28.459271,219483890.0,219483890.0_1


### Summary

Here we provide the same code combined into a single chunk so that you can simply copy and paste it and see the results.

In [24]:
import pandas as pd
from retentioneering import datasets
from retentioneering.data_processors_lib import AddStartEndEvents, AddStartEndEventsParams
from retentioneering.data_processors_lib import GroupEvents, GroupEventsParams
from retentioneering.data_processors_lib import LabelNewUsers, LabelNewUsersParams
from retentioneering.data_processors_lib import FilterEvents, FilterEventsParams
from retentioneering.data_processors_lib import LabelCroppedPaths, LabelCroppedPathsParams
from retentioneering.data_processors_lib import SplitSessions, SplitSessionsParams
from retentioneering.preprocessing_graph import PreprocessingGraph, EventsNode, MergeNode

# Load the simple_shop sample dataset used as input of the graph.
stream = datasets.load_simple_shop()

# node1
# AddStartEndEvents with default parameters.
node1 = EventsNode(AddStartEndEvents(params=AddStartEndEventsParams()))

# node2
def group_products(df, schema):
    """Flag the rows whose event name is one of the product pages.

    Returns a boolean Series aligned with ``df`` that is True where the
    ``schema.event_name`` column equals 'product1' or 'product2'.
    """
    product_pages = ['product1', 'product2']
    event_names = df[schema.event_name]
    return event_names.isin(product_pages)

# Keyword arguments for GroupEventsParams: the event name to assign and
# the function that selects the rows to group.
group_events_params={
    "event_name": "product",
    "func": group_products
}
node2 = EventsNode(GroupEvents(params=GroupEventsParams(**group_events_params)))

# node3
# The list of new users is downloaded from a Google Sheet through its
# CSV export URL (needs network access); the first column of the
# header-less CSV becomes the list.
google_spreadsheet_id = '1iggpIT5CZcLILLZ94wCZPQv90tERwi1IB5Y1969C8zc'
link = f'https://docs.google.com/spreadsheets/u/1/d/{google_spreadsheet_id}/export?format=csv&id={google_spreadsheet_id}'
new_users = pd.read_csv(link, header=None)[0].tolist()
node3 = EventsNode(LabelNewUsers(params=LabelNewUsersParams(new_users_list=new_users)))

# node4, node5
def get_new_users(df, schema):
    """Build a mask selecting every event of the users labelled as new.

    A user qualifies when at least one of their rows carries the
    'new_user' event; the returned boolean Series is True for all rows
    that belong to such users.
    """
    has_marker = df[schema.event_name] == 'new_user'
    marked_ids = df.loc[has_marker, schema.user_id].unique()
    return df[schema.user_id].isin(marked_ids)

def get_existing_users(df, schema):
    """Build a mask selecting every event of the users labelled as existing.

    A user qualifies when at least one of their rows carries the
    'existing_user' event; the returned boolean Series is True for all
    rows that belong to such users.
    """
    has_marker = df[schema.event_name] == 'existing_user'
    marked_ids = df.loc[has_marker, schema.user_id].unique()
    return df[schema.user_id].isin(marked_ids)

# node4 uses the new-users mask, node5 the existing-users mask.
node4 = EventsNode(FilterEvents(params=FilterEventsParams(func=get_new_users)))
node5 = EventsNode(FilterEvents(params=FilterEventsParams(func=get_existing_users)))

# node6
# NOTE(review): (1, 'h') presumably means one hour - confirm the unit codes in the LabelCroppedPaths docs.
params = {
    "left_cutoff": (1, 'h'),
    "right_cutoff": (1, 'h'),
}
node6 = EventsNode(LabelCroppedPaths(params=LabelCroppedPathsParams(**params)))

# node7, node8, node9
def remove_truncated_paths(df, schema):
    """Build a mask that excludes every user having a truncated path.

    Users with at least one 'cropped_left' or 'cropped_right' event are
    excluded entirely: the returned boolean Series is False for all of
    their rows and True for the rows of everyone else.
    """
    crop_markers = ['cropped_left', 'cropped_right']
    is_cropped = df[schema.event_name].isin(crop_markers)
    cropped_ids = df.loc[is_cropped, schema.user_id].unique()
    return ~df[schema.user_id].isin(cropped_ids)

# node7 filters with `remove_truncated_paths`, node8 merges two branches,
# node9 splits the merged stream into sessions.
# NOTE(review): timeout=(30, 'm') presumably means 30 minutes - confirm.
node7 = EventsNode(FilterEvents(params=FilterEventsParams(func=remove_truncated_paths)))
node8 = MergeNode()
node9 = EventsNode(SplitSessions(params=SplitSessionsParams(timeout=(30, 'm'))))

# linking the nodes to get the graph
# node3 fans out into node4 and node5; node5 continues through node6 and
# node7; node8 joins node4 with node7.
pgraph = PreprocessingGraph(stream)
pgraph.add_node(node=node1, parents=[pgraph.root])
pgraph.add_node(node=node2, parents=[node1])
pgraph.add_node(node=node3, parents=[node2])
pgraph.add_node(node=node4, parents=[node3])
pgraph.add_node(node=node5, parents=[node3])
pgraph.add_node(node=node6, parents=[node5])
pgraph.add_node(node=node7, parents=[node6])
pgraph.add_node(node=node8, parents=[node4, node7])
pgraph.add_node(node=node9, parents=[node8])

# getting the calculation results
# Combine at the last node (node9) and preview the first rows.
processed_stream = pgraph.combine(node=node9)
processed_stream.to_dataframe().head()

Unnamed: 0,event_id,event_type,event_index,event,timestamp,user_id,session_id
0,ccef9a9f-112e-4231-a1ed-109d9ca79c4a,path_start,0,path_start,2019-11-01 17:59:13.273932,219483890.0,219483890.0_1
1,d491bc30-e638-452f-9d34-0644c310e860,existing_user,0,existing_user,2019-11-01 17:59:13.273932,219483890.0,219483890.0_1
2,6037d2f2-4526-432f-9335-5b1a553dbb2e,session_start,0,session_start,2019-11-01 17:59:13.273932,219483890.0,219483890.0_1
3,bb1f68c8-dd15-4446-b940-f9ec9688d943,raw,0,catalog,2019-11-01 17:59:13.273932,219483890.0,219483890.0_1
4,e19e9d4b-029e-4e8b-b077-16b77c25578c,group_alias,1,product,2019-11-01 17:59:28.459271,219483890.0,219483890.0_1


## Saving preprocessing config

In [17]:
# save preprocessing config to file
# '/path/to/pgraph_config.json' is a placeholder: replace it with a real,
# writable location and use the same path in both calls below.
pgraph.export_to_file('/path/to/pgraph_config.json')

# load data and create new PreprocessingGraph instance
new_stream = datasets.load_simple_shop()
new_pgraph = new_stream.preprocessing_graph()

# restore the preserved preprocessing configuration
new_pgraph.import_from_file('/path/to/pgraph_config.json')

Method chaining preprocessing graph
-----------------------------------

In [25]:
# Transition graph of the original `stream`, before any of the chained processing below.
stream.transition_graph(edges_norm_type='node')

<retentioneering.tooling.transition_graph.transition_graph.TransitionGraph at 0x17e69eb20>

In [26]:
def group_browsing(df, schema):
    """Flag the rows whose event name is a browsing page.

    Returns a boolean Series aligned with ``df`` that is True where the
    ``schema.event_name`` column equals 'catalog' or 'main'.
    """
    browsing_pages = ['catalog', 'main']
    event_names = df[schema.event_name]
    return event_names.isin(browsing_pages)

def group_products(df, schema):
    """Flag the rows whose event name is one of the product pages.

    Returns a boolean Series aligned with ``df`` that is True where the
    ``schema.event_name`` column equals 'product1' or 'product2'.

    NOTE(review): the method chain in this cell does not reference this
    function - confirm whether that is intentional.
    """
    product_pages = ['product1', 'product2']
    event_names = df[schema.event_name]
    return event_names.isin(product_pages)

def group_delivery(df, schema):
    """Flag the rows whose event name is one of the delivery events.

    Returns a boolean Series aligned with ``df`` that is True where the
    ``schema.event_name`` column equals 'delivery_choice',
    'delivery_courier' or 'delivery_pickup'.
    """
    delivery_events = ['delivery_choice', 'delivery_courier', 'delivery_pickup']
    event_names = df[schema.event_name]
    return event_names.isin(delivery_events)

def group_payment(df, schema):
    """Flag the rows whose event name is one of the payment events.

    Returns a boolean Series aligned with ``df`` that is True where the
    ``schema.event_name`` column equals 'payment_choice', 'payment_done',
    'payment_card' or 'payment_cash'.
    """
    payment_events = ['payment_choice', 'payment_done', 'payment_card', 'payment_cash']
    event_names = df[schema.event_name]
    return event_names.isin(payment_events)

# Apply the data processors as chained method calls on the stream;
# parentheses replace the backslash line continuations.
stream_7_nodes = (
    stream
    .drop_paths(min_steps=6)
    .add_start_end_events()
    .split_sessions(timeout=(30, 'm'))
    .group_events(event_name='browsing', func=group_browsing)
    .group_events(event_name='delivery', func=group_delivery)
    .group_events(event_name='payment', func=group_payment)
)

In [27]:
# Collapse loops with the default arguments, then draw the transition graph of the result.
stream_out = stream_7_nodes.collapse_loops()
stream_out.transition_graph(edges_norm_type='node')

<retentioneering.tooling.transition_graph.transition_graph.TransitionGraph at 0x17efa0e80>

In [28]:
# Same as the default call but with suffix='loop'; `stream_out` is rebound to the new result.
stream_out = stream_7_nodes.collapse_loops(suffix='loop')
stream_out.transition_graph(edges_norm_type='node')

<retentioneering.tooling.transition_graph.transition_graph.TransitionGraph at 0x17f037c70>

In [29]:
# Variant with suffix='count'. No edges_norm_type is passed to transition_graph here,
# and the trailing ';' suppresses the textual repr of the returned object.
stream_out = stream_7_nodes.collapse_loops(suffix='count')
stream_out.transition_graph();