# Source code for retentioneering.core.core_functions.project

# * Copyright (C) 2020 Maxim Godzi, Anatoly Zaytsev, Retentioneering Team
# * This Source Code Form is subject to the terms of the Retentioneering Software Non-Exclusive License (License)
# * By using, sharing or editing this code you agree with the License terms and conditions.
# * You can obtain License text at https://github.com/retentioneering/retentioneering-tools/blob/master/LICENSE.md

import pandas as pd

from retentioneering.visualization import plot_project
from .extract_features import _learn_tsne, _learn_umap


def project(self, *, method='tsne', targets=(), ngram_range=(1, 1), feature_type='tfidf', plot_type=None, **kwargs):
    """
    Does dimension reduction of user trajectories and draws projection plane.

    The computed projection is also stored on the accessor as
    ``self._projection``.

    Parameters
    ----------
    method: {'umap', 'tsne'} (optional, default 'tsne')
        Type of manifold transformation.
    plot_type: {'targets', 'clusters', None} (optional, default None)
        Type of color-coding used for projection visualization:
            - 'clusters': colors trajectories with different colors depending
              on cluster number. IMPORTANT: must do .rete.get_clusters()
              before to obtain cluster mapping.
            - 'targets': colors trajectories based on reach to any event
              provided in 'targets' parameter. Must provide 'targets'
              parameter in this case.
        If None, then only calculates the projection without visualization.
    targets: list or tuple of str (optional, default ())
        Vector of event_names as str. If user reaches any of the specified
        events, the dot corresponding to this user will be highlighted as
        converted on the resulting projection plot. Nested sequences are
        flattened; a single event name given as a plain str is treated as a
        one-element list.
    feature_type: str, (optional, default 'tfidf')
        Type of vectorizer to use before dimension-reduction. Available
        vectorization methods: {'tfidf', 'count', 'binary', 'frequency'}
    ngram_range: tuple, (optional, default (1,1))
        The lower and upper boundary of the range of n-values for different
        word n-grams or char n-grams to be extracted before
        dimension-reduction. For example ngram_range=(1, 1) means only single
        events, (1, 2) means single events and bigrams.
    **kwargs:
        Extra keyword arguments forwarded unchanged to the selected
        projection routine (``_learn_tsne`` or ``_learn_umap``).

    Returns
    --------
    Dataframe with data in the low-dimensional space for user trajectories
    indexed by user IDs.

    Return type
    --------
    pd.DataFrame

    Raises
    ------
    ValueError
        If ``method`` or ``plot_type`` is not one of the supported values, or
        if ``plot_type='targets'`` is requested without any target events.
    AttributeError
        If ``plot_type='clusters'`` is requested before clusters were
        computed with ``.rete.get_clusters()``.
    """
    # Validate the options up front: an unknown value would otherwise surface
    # only after the expensive embedding step (as a NameError), or silently
    # return a stale projection left over from a previous call.
    if method not in ('tsne', 'umap'):
        raise ValueError("Unknown method {!r}: expected 'tsne' or 'umap'".format(method))
    if plot_type not in (None, 'clusters', 'targets'):
        raise ValueError("Unknown plot_type {!r}: expected 'clusters', 'targets' or None".format(plot_type))

    event_col = self.retention_config['event_col']
    index_col = self.retention_config['user_col']

    targets_mapping = None
    legend_title = None

    if plot_type == 'clusters':
        if not hasattr(self, 'clusters'):
            raise AttributeError("Run .rete.get_clusters() before using plot_type='clusters' to obtain clusters mapping")
        targets_mapping = self.clusters
        legend_title = 'cluster number:'
    elif plot_type == 'targets':
        missing_targets = "When plot_type ='targets' must provide parameter targets as list of target event names"
        if targets is None:
            raise ValueError(missing_targets)
        # A bare string would be flattened into single characters below.
        if isinstance(targets, str):
            targets = [targets]
        target_events = list(pd.core.common.flatten(targets))
        # The default targets=() is not None, so emptiness has to be checked
        # after flattening rather than with an `is None` test alone.
        if not target_events:
            raise ValueError(missing_targets)

        legend_title = 'conversion to (' + ' | '.join(target_events).strip(' | ') + '):'

        # Build the lookup set once instead of once per user.
        target_set = set(target_events)
        targets_mapping = (self._obj
                           .groupby(index_col)[event_col]
                           .apply(lambda events: not target_set.isdisjoint(events))
                           .sort_index()
                           .values)

    features = self.extract_features(feature_type=feature_type,
                                     ngram_range=ngram_range)

    learn = _learn_tsne if method == 'tsne' else _learn_umap
    self._projection = learn(features, **kwargs)

    # return only embeddings if no plot_type:
    if plot_type is None:
        return self._projection

    plot_project.plot_projection(
        projection=self._projection.values,
        targets=targets_mapping,
        legend_title=legend_title,
    )

    return self._projection