# * Copyright (C) 2020 Maxim Godzi, Anatoly Zaytsev, Retentioneering Team
# * This Source Code Form is subject to the terms of the Retentioneering Software Non-Exclusive License (License)
# * By using, sharing or editing this code you agree with the License terms and conditions.
# * You can obtain License text at https://github.com/retentioneering/retentioneering-tools/blob/master/LICENSE.md
import pandas as pd
import numpy as np
import umap.umap_ as umap
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Supported strategies for turning event sequences into feature vectors
# (consumed as the ``feature_type`` argument of ``_embedder``).
_embedding_types = {'tfidf', 'count', 'binary', 'frequency'}
def _embedder(data, *,
              feature_type,
              ngram_range=(1, 1)):
    """
    Vectorizes a clickstream into a user-by-event feature matrix.

    Each user's events are concatenated into a single document and
    vectorized according to ``feature_type``:

      - ``'tfidf'``: event frequencies weighted by inverse document frequency;
      - ``'count'``: raw event counts;
      - ``'frequency'``: event counts normalized to sum to 1 per user;
      - ``'binary'``: 1/0 indicators of event presence.

    Parameters
    --------
    data: pd.DataFrame
        Clickstream dataset.
    feature_type: str
        One of ``'tfidf'``, ``'count'``, ``'binary'``, ``'frequency'``.
        Keyword-only.
    ngram_range: tuple, optional
        Range of ngrams to use in feature extraction. Default: ``(1, 1)``

    Returns
    --------
    Dataframe with one row per ``index_col`` value and one column per
    event ngram, filled according to ``feature_type``.

    Return type
    -------
    pd.DataFrame

    Raises
    ------
    ValueError
        If ``feature_type`` is not one of the supported strategies.
    """
    # Fail fast with a clear message instead of an unbound-variable NameError
    # further down when an unknown strategy is requested.
    if feature_type not in _embedding_types:
        raise ValueError('feature_type must be one of {}, got {!r}'
                         .format(sorted(_embedding_types), feature_type))
    index_col = data.rete.retention_config['user_col']
    event_col = data.rete.retention_config['event_col']
    # Join each user's events into one '~~'-separated "document"; the
    # token_pattern '[^~]+' below then tokenizes exactly on that separator,
    # so event names may contain spaces and most punctuation safely.
    corpus = data.groupby(index_col)[event_col].apply(
        lambda x: '~~'.join([el.lower() for el in x])
    )
    if feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, token_pattern='[^~]+').fit(corpus)
    elif feature_type in {'count', 'frequency'}:
        vectorizer = CountVectorizer(ngram_range=ngram_range, token_pattern='[^~]+').fit(corpus)
    else:  # 'binary' — guaranteed by the validation above
        vectorizer = CountVectorizer(ngram_range=ngram_range, token_pattern='[^~]+', binary=True).fit(corpus)
    # Column names must follow the vectorizer's internal feature indices,
    # hence the sort by vocabulary position.
    cols = [dict_key[0] for dict_key in sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1])]
    vec_data = pd.DataFrame(index=sorted(data[index_col].unique()),
                            columns=cols,
                            data=vectorizer.transform(corpus).todense())
    # normalize if frequency: rows become relative frequencies;
    # fillna(0) guards against division by zero for all-zero rows
    if feature_type == 'frequency':
        vec_data = vec_data.div(vec_data.sum(axis=1), axis=0).fillna(0)
    setattr(vec_data.rete, 'datatype', 'features')
    return vec_data
def _learn_tsne(data, **kwargs):
    """
    Calculates TSNE transformation for given matrix features.

    Parameters
    --------
    data: pd.DataFrame
        Feature matrix; rows are observations, the index is preserved
        in the result.
    kwargs: optional
        Parameters for ``sklearn.manifold.TSNE()``. Unknown keys are
        silently dropped; ``random_state`` is always pinned to 0 for
        reproducibility and cannot be overridden.

    Returns
    -------
    Two-dimensional TSNE embedding, one row per input row.

    Return type
    -------
    pd.DataFrame
    """
    # Derive the accepted parameter names from the installed sklearn
    # version (mirrors the filtering approach in _learn_umap) instead of
    # a hard-coded list that goes stale when sklearn renames parameters
    # (e.g. n_iter -> max_iter). random_state is excluded because it is
    # pinned explicitly below; keeping it would raise a duplicate-keyword
    # TypeError.
    valid_params = TSNE().get_params()
    kwargs = {k: v for k, v in kwargs.items()
              if k in valid_params and k != 'random_state'}
    res = TSNE(random_state=0, **kwargs).fit_transform(data.values)
    return pd.DataFrame(res, index=data.index.values)
def _learn_umap(data, **kwargs):
    """
    Calculates UMAP transformation for given matrix features.

    Parameters
    --------
    data: pd.DataFrame
        Feature matrix; rows are observations, the index is preserved
        in the result.
    kwargs: optional
        Parameters for ``umap.UMAP()``. Unknown keys are silently
        dropped; ``random_state`` is always pinned to 0 for
        reproducibility and cannot be overridden.

    Returns
    -------
    Two-dimensional UMAP embedding, one row per input row.

    Return type
    -------
    pd.DataFrame
    """
    # Keep only kwargs that the installed umap version accepts.
    # random_state must be excluded: get_params() includes it, so passing
    # it through would collide with the explicit random_state=0 below and
    # raise a duplicate-keyword TypeError.
    valid_params = umap.UMAP().get_params()
    kwargs = {k: v for k, v in kwargs.items()
              if k in valid_params and k != 'random_state'}
    embedding = umap.UMAP(random_state=0, **kwargs).fit_transform(data.values)
    return pd.DataFrame(embedding, index=data.index.values)