Source code for retentioneering.tooling.sequences.sequences

from __future__ import annotations

import itertools
from typing import Callable, Literal, Tuple

import matplotlib.colors as mcolors
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import colormaps, colors
from scipy.sparse import csc_matrix
from sklearn.feature_extraction.text import CountVectorizer

from retentioneering.backend.tracker import (
    collect_data_performance,
    time_performance,
    track,
    tracker,
)
from retentioneering.eventstream.types import EventstreamType

MetricsType = Literal["paths", "paths_share", "count", "count_share"]


[docs]class Sequences:
    """
    A class that provides methods for patterns exploration.

    Parameters
    ----------
    eventstream : EventstreamType


    See Also
    --------
    .Eventstream.sequences : Call Sequences tool as an eventstream method.


    Notes
    -----
    See :doc:`Sequences user guide</user_guides/sequences>` for the details.
    """

    __eventstream: EventstreamType
    ngram_range: Tuple[int, int]
    groups: Tuple[list, list] | None
    weight_col: str
    metrics: MetricsType | None
    final_columns_level_0: list[str]
    group_names: Tuple[str, str] | None
    DEFAULT_GROUP_NAMES = ("group_1", "group_2")
    SEQUENCE_TYPE_GROUPS_COLUMN_NAME = ("sequence_type", "")
    SEQUENCE_TYPE_COLUMN_NAME = "sequence_type"
    RELATIVE_DELTA_COLUMN_NAME = "delta"
    ABS_DELTA_COLUMN_NAME = "delta_abs"
    ORIGINAL_METRICS_LIST = ["paths", "paths_share", "count", "count_share"]

    _vec_data: pd.DataFrame

    @time_performance(
        scope="sequences",
        event_name="init",
    )
    def __init__(self, eventstream: EventstreamType):
        self._default_columns_level_0: list = []
        self._sample_column_name: str | None = None
        self._full_metrics_list: list = []
        self._samples_full = None
        self.__eventstream = eventstream
        self.user_col = self.__eventstream.schema.user_id
        self.event_col = self.__eventstream.schema.event_name
        self.time_col = self.__eventstream.schema.event_timestamp
        self._sequence_df = pd.DataFrame()
        self._show_values_df = pd.DataFrame()

    def _sklearn_vectorization(
        self, events: pd.DataFrame, ngram_range: Tuple[int, int], weight_col: str
    ) -> pd.DataFrame:
        pattern_space = "}}}}}"
        corpus = events.groupby(weight_col)[self.event_col].apply(
            lambda x: "~~".join([el.replace(" ", pattern_space) for el in x])
        )

        vectorizer = CountVectorizer(ngram_range=ngram_range, token_pattern="[^~]+", lowercase=False)
        vectorizer_result = vectorizer.fit_transform(corpus)

        cols = pd.Series(vectorizer.get_feature_names_out()).str.replace(" ", " -> ")
        cols = cols.str.replace(pattern_space, " ")
        data = vectorizer_result.todense()

        vec_data = pd.DataFrame(index=corpus.index, columns=cols, data=data)
        vec_data.index.rename(weight_col, inplace=True)
        return vec_data

    def _calculate_metrics_df(self, vec_data: pd.DataFrame) -> pd.DataFrame:
        sequence_df = (
            pd.concat([vec_data.sum(), vec_data.astype(bool).sum(axis=0)], axis=1, keys=["count", self.weight_col])
            .fillna(0)
            .astype(int)
        )

        sequence_df["count_share"] = sequence_df["count"] / sequence_df["count"].sum()
        sequence_df[f"{self.weight_col}_share"] = sequence_df[self.weight_col] / len(vec_data)
        sequence_df = sequence_df[self._full_metrics_list]

        return sequence_df

    def _add_support_set(self, vec_data: pd.DataFrame, weight_col: str) -> pd.DataFrame:
        sparse_matrix = csc_matrix(vec_data.values)
        idx = vec_data.index
        samples_full = []
        for j, col in enumerate(vec_data.columns):
            non_zero_indices = sparse_matrix.indices[sparse_matrix.indptr[j] : sparse_matrix.indptr[j + 1]]
            samples_full.append([idx[non_zero_indices].tolist()])
        samples_full = pd.DataFrame(samples_full, index=vec_data.columns, columns=[self._sample_column_name])

        return samples_full

    @staticmethod
    def _sequence_type(x: str) -> str:
        temp = x.split(" -> ")
        n = len(temp)
        n_unique = len(set(temp))
        if (n_unique > 1) and (temp[0] == temp[-1]):
            return "cycle"
        if (n_unique == 1) and (n > 1):
            return "loop"
        return "other"

    def _build_groups_df(
        self, vec_data: pd.DataFrame, groups: Tuple[list, list], group_names: Tuple[str, str]
    ) -> pd.DataFrame:
        vec_data["group"] = None
        vec_data.loc[groups[0], "group"] = group_names[0]
        vec_data.loc[groups[1], "group"] = group_names[1]
        group_mask = vec_data["group"].copy()
        vec_data.drop(columns="group", inplace=True)
        group_dfs = []
        samples_dfs = []
        for group in group_names:
            vec_data_group = vec_data[group_mask == group]
            sequence_df_group = self._calculate_metrics_df(vec_data_group)
            sequence_df_group.columns = pd.MultiIndex.from_product([sequence_df_group.columns, [group]])
            group_dfs.append(sequence_df_group)
            samples_full_group = self._add_support_set(vec_data=vec_data_group, weight_col=self.weight_col)
            samples_dfs.append(samples_full_group)

        sequence_df = pd.concat(group_dfs, axis=1)
        sequence_type = pd.Series(sequence_df.index).apply(lambda x: self._sequence_type(x))
        sequence_df[self.SEQUENCE_TYPE_GROUPS_COLUMN_NAME] = sequence_type.values

        samples_full = pd.concat(samples_dfs, axis=1)
        samples_columns = [(self._sample_column_name, group_names[0]), (self._sample_column_name, group_names[1])]
        samples_full.columns = pd.MultiIndex.from_tuples(samples_columns)

        for metric in self._full_metrics_list:
            sequence_df[(metric, self.ABS_DELTA_COLUMN_NAME)] = (
                sequence_df[(metric, group_names[1])] - sequence_df[(metric, group_names[0])]
            )
            sequence_df[(metric, self.RELATIVE_DELTA_COLUMN_NAME)] = (
                sequence_df[(metric, self.ABS_DELTA_COLUMN_NAME)] / sequence_df[(metric, group_names[0])]
            )

        final_columns = (
            list(itertools.product(self._full_metrics_list, self._default_columns_level_0))
            + [self.SEQUENCE_TYPE_GROUPS_COLUMN_NAME]
            + samples_columns
        )
        index_cols = pd.MultiIndex.from_tuples(final_columns)

        sequence_df = sequence_df.merge(samples_full, left_index=True, right_index=True)

        return sequence_df[index_cols]

[docs]    @time_performance(
        scope="sequences",
        event_name="fit",
    )
    def fit(
        self,
        ngram_range: Tuple[int, int] = (1, 1),
        groups: Tuple[list, list] | None = None,
        group_names: Tuple[str, str] | None = None,
        weight_col: str | None = None,
    ) -> None:
        """
        Calculate statistics on n-grams found in eventstream.
        Calculated metrics:

        - ``paths``: The number of unique paths that contain each particular event sequence
          (calculated within the specified ``weight_col``, or ``user_id`` by default).
        - ``paths_share``: The ratio of paths to the sum of paths.
        - ``count``: The number of occurrences of a particular sequence.
        - ``count_share``: The ratio of count to the sum of counts.

        Defined sequences types:

        - ``loop`` - if sequence length >= 2 and all the events are equal.
        - ``cycle`` - if sequence length >= 3 and start and end events are equal.
        - ``other`` - all other sequences.

        Parameters
        ----------
        ngram_range : Tuple(int, int)
            The lower and upper boundary of the range of n-values for different word n-grams to be
            extracted. For example, ngram_range=(1, 1) means only single events, (1, 2) means single events
            and bigrams.
        groups : Tuple(list, list), optional
            Can be specified to calculate statistics for n-grams group-wise and provide delta values.
            Must contain a tuple of two elements (g_1, g_2), where g_1 and g_2 are collections
            of path IDs (for the column specified in the ``weight_col`` parameter).
        group_names : default ('group_1', 'group_2')
            Names for the selected groups g_1 and g_2, which will be shown in the final plot header.
        weight_col : str, optional
            The column used for calculating 'paths' and 'paths_share' metrics.
            If not specified, the ``user_id`` from ``eventstream.schema``
            will be used. For example, it can be specified as ``session_id`` if ``eventstream``
            has such a ``custom_col``.


        Returns
        -------
        None

        Notes
        -----
        See the results of calculation using :py:func:`plot` method and the :py:func:`values` attribute.

        """
        called_params = {
            "ngram_range": ngram_range,
            "groups": groups,
            "group_names": group_names,
            "weight_col": weight_col,
        }
        not_hash_values = ["ngram_range"]

        data = self.__eventstream.to_dataframe()
        self.ngram_range = ngram_range
        self.groups = groups
        self.weight_col = weight_col or self.__eventstream.schema.user_id
        self._sample_column_name = f"{self.weight_col}_sample"
        self._full_metrics_list = [self.weight_col, f"{self.weight_col}_share", "count", "count_share"]

        if group_names:
            self.group_names = group_names

        else:
            self.group_names = self.DEFAULT_GROUP_NAMES

        if self.groups:
            self._default_columns_level_0 = [
                self.group_names[0],
                self.group_names[1],
                self.ABS_DELTA_COLUMN_NAME,
                self.RELATIVE_DELTA_COLUMN_NAME,
            ]
            data = data[data[self.weight_col].isin(list(groups[0]) + list(groups[1]))]  # type: ignore

        vec_data = self._sklearn_vectorization(events=data, ngram_range=self.ngram_range, weight_col=self.weight_col)

        if self.groups:
            self._sequence_df = self._build_groups_df(
                vec_data=vec_data, groups=self.groups, group_names=self.group_names
            )

        else:
            sequence_df = self._calculate_metrics_df(vec_data)

            sequence_type = pd.Series(sequence_df.index).apply(lambda x: self._sequence_type(x))
            sequence_df[self.SEQUENCE_TYPE_COLUMN_NAME] = sequence_type.values
            samples_full = self._add_support_set(vec_data=vec_data, weight_col=self.weight_col)
            self._sequence_df = sequence_df.join(samples_full[self._sample_column_name])

        self._sequence_df.index.name = "Sequence"
        self._show_values_df = self._sequence_df.copy()

        collect_data_performance(
            scope="sequences",
            event_name="metadata",
            called_params=called_params,
            not_hash_values=not_hash_values,
            performance_data={"shape": self._show_values_df.shape},
            eventstream_index=self.__eventstream._eventstream_index,
        )

    def _size_sample_ids(
        self, data: pd.DataFrame, sample_size: int | None, final_columns_level_0: list
    ) -> pd.DataFrame:
        if sample_size:
            displayed_columns = final_columns_level_0 + [self.SEQUENCE_TYPE_COLUMN_NAME] + [self._sample_column_name]
            data = data[displayed_columns].copy()

            def shuffle_and_slice(x: pd.Series, size: int = sample_size) -> pd.Series:  # type: ignore
                np.random.seed(42)
                np.random.shuffle(x)
                return x[:size]

            if self.groups:
                for group in self.group_names:  # type: ignore
                    data[(self._sample_column_name, group)] = data[(self._sample_column_name, group)].apply(
                        lambda x: shuffle_and_slice(x)
                    )
            else:
                data[self._sample_column_name] = data[self._sample_column_name].apply(lambda x: shuffle_and_slice(x))

        else:
            displayed_columns = final_columns_level_0 + [self.SEQUENCE_TYPE_COLUMN_NAME]
            data = self._sequence_df[displayed_columns]

        return data

    @staticmethod
    def _create_plot(
        data: pd.DataFrame, heatmap_columns: list, precision: int | None
    ) -> pd.io.formats.style.Styler:  # type: ignore
        m1 = data.eq(np.inf)
        m2 = data.eq(-np.inf)

        heatmap_plot_mask = data.mask(m1, data[~m1].max(numeric_only=True).max() + 1).mask(
            m2, data[~m2].min(numeric_only=True).min() - 1
        )
        cm = sns.diverging_palette(h_neg=246, h_pos=27, s=99, l=68, as_cmap=True)

        def add_background(s: pd.Series, cmap: str = "PuBu") -> list[str]:  # type: ignore
            a = heatmap_plot_mask.loc[:, s.name].copy()  # type: ignore

            if a.min() >= 0:
                norm = mcolors.TwoSlopeNorm(vmin=-1, vcenter=0, vmax=a.max())

            elif a.max() <= 0:
                norm = mcolors.TwoSlopeNorm(vmin=a.min(), vcenter=0, vmax=1)

            else:
                norm = mcolors.TwoSlopeNorm(vmin=a.min(), vcenter=0.0, vmax=a.max())

            normed = norm(a.values)
            c = [colors.rgb2hex(x) for x in colormaps.get_cmap(cmap)(normed)]
            return ["background-color: %s" % color for color in c]

        plot_sequence_df_styler = data.style.apply(add_background, subset=heatmap_columns, cmap=cm).format(
            precision=precision, thousands=" "
        )
        return plot_sequence_df_styler

    def _check_params(
        self,
        plot_sequence_df: pd.DataFrame,
        metrics: MetricsType | None = None,
        threshold: tuple[str, float | int] | None = None,
        sorting: tuple[str | tuple, bool] | tuple[list[str | tuple], list[bool]] | None = None,
        heatmap_columns: str | list[str | tuple] | None = None,
    ) -> Tuple[list, tuple, tuple, list]:
        replacements = {"paths": self.weight_col, "paths_share": f"{self.weight_col}_share"}
        replacer = replacements.get

        final_columns_full, final_columns_level_0 = self._check_metrics(
            plot_sequence_df=plot_sequence_df, metrics=metrics, replacer=replacer
        )

        heatmap_columns_list: list
        heatmap_columns_list = self._define_heatmap_columns(
            heatmap_columns=heatmap_columns,
            replacer=replacer,
            final_columns_level_0=final_columns_level_0,
            final_columns_full=final_columns_full,
        )

        sorting_checked = self._define_sorting_columns(
            sorting=sorting, replacer=replacer, final_columns_full=final_columns_full
        )

        threshold_checked: Tuple[list | str | None, float | int | None]
        if threshold:
            threshold_checked = self._define_threshold_columns(
                threshold=threshold, replacer=replacer, final_columns_full=final_columns_full
            )
        else:
            threshold_checked = None, None

        return heatmap_columns_list, sorting_checked, threshold_checked, final_columns_level_0

    def _check_metrics(
        self, plot_sequence_df: pd.DataFrame, replacer: Callable, metrics: MetricsType | None = None
    ) -> Tuple[list, list]:
        if self.groups:
            final_columns_level_0 = [self.weight_col]
        else:
            final_columns_level_0 = self._full_metrics_list

        if metrics:
            if isinstance(metrics, str):  # type: ignore
                final_metrics = [metrics]
            else:
                final_metrics = metrics
            if not set(final_metrics).issubset(set(self.ORIGINAL_METRICS_LIST)):
                unexpected_metrics = set(final_metrics).difference(set(self.ORIGINAL_METRICS_LIST))
                raise ValueError(
                    f"""Unexpected metrics value: {unexpected_metrics}. Should be a metric name or
                                 a list of metric names from {self.ORIGINAL_METRICS_LIST}."""
                )

            final_columns_level_0 = [replacer(n, n) for n in final_metrics]
        final_columns_full = list(plot_sequence_df[final_columns_level_0].columns)
        return final_columns_full, final_columns_level_0

    def _define_sorting_columns(
        self,
        sorting: tuple[str | tuple, bool] | tuple[list[str | tuple], list[bool]] | None,
        replacer: Callable,
        final_columns_full: list,
    ) -> Tuple[str | list | tuple, bool | list | tuple]:
        # sorting
        if sorting:
            error_message_text_sorting = f"""Should be tuple or list with two elements:
            (column_name, bool) or ([column_name], [list of bool]). Note that those two elements should be the
            same length. For example: ('count_share', False), ([('count_share', 'delta'),('paths_share', 'delta')],
            [False, True])."""
            sorting_columns: str | list | tuple
            sorting_direction: bool | list | tuple
            if not self.groups:
                if isinstance(sorting, (list, tuple)):  # type: ignore
                    if isinstance(sorting[0], str) and isinstance(sorting[1], bool):
                        sorting_columns = [sorting[0]]
                        sorting_direction = [sorting[1]]

                    elif len(sorting) == 2 and all(isinstance(elem, (list, tuple)) for elem in sorting):
                        sorting_columns = sorting[0]
                        sorting_direction = sorting[1]
                        if (
                            not all(isinstance(elem, str) for elem in sorting_columns)
                            or not all(isinstance(elem, bool) for elem in sorting_direction)  # type: ignore
                            or not len(sorting_columns) == len(sorting_direction)  # type: ignore
                        ):
                            raise TypeError(f"Unexpected sorting parameter type. {error_message_text_sorting}")
                    else:
                        raise TypeError(f"Unexpected sorting parameter type. {error_message_text_sorting}")
                else:
                    raise TypeError(f"Unexpected sorting parameter type. {error_message_text_sorting}")

                sorting_columns = [(replacer(col, col)) for col in sorting_columns]

            else:
                if isinstance(sorting, (list, tuple)):  # type: ignore
                    if isinstance(sorting[0], (list, tuple)) and isinstance(sorting[1], bool):
                        sorting_columns = [sorting[0]]
                        sorting_direction = [sorting[1]]

                    elif len(sorting) == 2 and all(isinstance(elem, (list, tuple)) for elem in sorting):
                        sorting_columns = sorting[0]
                        sorting_direction = sorting[1]
                        if len(sorting_columns) != len(sorting_direction):  # type: ignore
                            raise TypeError(f"Unexpected sorting parameter type. {error_message_text_sorting}")
                    else:
                        raise TypeError(f"Unexpected sorting parameter type. {error_message_text_sorting}")

                sorting_columns = [(replacer(col[0], col[0]), col[1]) for col in sorting_columns]

            if not set(sorting_columns).issubset(set(final_columns_full)):
                unexpected_sorting_columns = set(sorting_columns).difference(set(final_columns_full))
                raise ValueError(
                    f"""Unexpected sorting_columns name: {unexpected_sorting_columns}.
                                     {error_message_text_sorting}"""
                )
        else:
            if self.groups:
                sorting_columns = final_columns_full[3]
                sorting_direction = False
            else:
                sorting_columns = final_columns_full[0]
                sorting_direction = False

        return sorting_columns, sorting_direction

    def _define_threshold_columns(
        self, threshold: tuple[str | list, float | int], replacer: Callable, final_columns_full: list
    ) -> Tuple[list | str | None, float | int | None]:
        threshold_column, threshold_value = threshold
        error_message_text_thresh = f"""Should be a tuple or list with column_name as the first element and int or
                                        float values as the second.
                                        For example: ('count_share', 0.5), (('count_share', 'delta'), 0.5)."""
        if not isinstance(threshold_value, (int, float)):  # type: ignore
            raise TypeError(f"Unexpected threshold parameter type. {error_message_text_thresh}")

        if not self.groups:
            if not isinstance(threshold_column, str):  # type: ignore
                raise TypeError(f"Unexpected threshold parameter type. {error_message_text_thresh}")
            else:
                threshold_column = replacer(threshold_column, threshold_column)

        else:
            if not isinstance(threshold_column, (list, tuple)):
                raise TypeError(f"Unexpected threshold parameter type. {error_message_text_thresh}")
            elif not len(threshold_column) == 2 and all(isinstance(elem, str) for elem in threshold_column):  # type: ignore
                raise TypeError(f"Unexpected threshold parameter type. {error_message_text_thresh}")
            else:
                threshold_column = (replacer(threshold_column[0], threshold_column[0]), threshold_column[1])  # type: ignore

        if threshold_column not in final_columns_full:
            raise ValueError(
                f"""Unexpected sorting_columns name: {threshold_column}.
                                                 {error_message_text_thresh}"""
            )

        return threshold_column, threshold_value  # type: ignore

    def _define_heatmap_columns(
        self,
        heatmap_columns: str | list[str | tuple] | None,
        replacer: Callable,
        final_columns_level_0: list,
        final_columns_full: list,
    ) -> list:
        error_message_text_heatmap = f"""Should be a column name or a list
                                        of column names. For example: ['count_share', 'delta'],
                                        [('count_share', 'delta'),('paths_share', 'delta')]."""
        if heatmap_columns:
            if not self.groups:
                if isinstance(heatmap_columns, str):
                    heatmap_columns = [heatmap_columns]
                elif isinstance(heatmap_columns, (list, tuple)):  # type: ignore
                    if not all(isinstance(col, str) for col in heatmap_columns):
                        raise TypeError(f"Unexpected heatmap_columns type. {error_message_text_heatmap}")
                else:
                    raise TypeError(f"Unexpected heatmap_columns type. {error_message_text_heatmap}")

                heatmap_columns = [replacer(col, col) for col in heatmap_columns]

            else:
                if isinstance(heatmap_columns, (list, tuple)):
                    if len(heatmap_columns) == 2 and all(isinstance(col, str) for col in heatmap_columns):
                        heatmap_columns = [heatmap_columns]  # type: ignore
                    elif not all(isinstance(col, (list, tuple)) for col in heatmap_columns):
                        raise TypeError(f"Unexpected heatmap_columns type. {error_message_text_heatmap}")

                    heatmap_columns = [(replacer(col[0], col[0]), col[1]) for col in heatmap_columns]  # type: ignore

            if not set(heatmap_columns).issubset(set(final_columns_full)):
                unexpected_heatmap_columns = set(heatmap_columns).difference(set(final_columns_full))
                raise ValueError(
                    f"""Unexpected heatmap_columns name: {unexpected_heatmap_columns}.
                                            {error_message_text_heatmap}"""
                )

        else:
            if self.groups:
                heatmap_columns = list(itertools.product(final_columns_level_0, [self.RELATIVE_DELTA_COLUMN_NAME]))
            else:
                heatmap_columns = [f"{self.weight_col}_share"]
        return heatmap_columns  # type: ignore

[docs]    @time_performance(
        scope="sequences",
        event_name="plot",
    )
    def plot(
        self,
        metrics: MetricsType | None = None,
        threshold: tuple[str, float | int] | None = None,
        sorting: tuple[str | tuple, bool] | tuple[list[str | tuple], list[bool]] | None = None,
        heatmap_cols: str | list[str | tuple] | None = None,
        sample_size: int | None = 1,
        precision: int = 2,
    ) -> pd.io.formats.style.Styler:  # type: ignore
        """
        Parameters
        ----------
        metrics : {'paths', 'paths_share', 'count', 'count_share'}, optional
            Specify the metrics to be displayed in the plot.

            - If groups are specified, by default, only the 'paths' metric will be plotted within each group,
              along with both deltas (relative and absolute).
            - If ``groups=None``, all four metrics will be shown by default.

        threshold : tuple[str, float | int], optional
            Used to filter out infrequent sequences based on the specified metric.

            - Example without groups: ('paths', 1200)
            - Example with groups: (('user_id', 'group_1'), 1200)

            Only rows with values greater than or equal to 1200 in the specified column will be displayed.

        sorting : Tuple(str or list of str, bool or list of bool) or None, default None
            - The first element in the tuple: Column name or list of names for sorting.
            - The second element: The sorting order (ascending vs. descending). Specify a list for multiple
              sorting orders.
              If a list of bools is provided, it must match the length of the sorting columns.

        heatmap_cols : str or list of str or list of tuples or None
            Specifies columns to be represented in the heatmap as follows:

            - The heatmap range is calculated column-wise.
            - For columns containing negative values, the palette will be divergent (blue - orange with white as zero).
            - For columns with only positive values, the palette will be orange.
            - For columns with only negative values, the palette will be blue.

            Default values

        sample_size : int or None, default 1
            Number of ID samples to display.
        precision : int, default 2
            Number of decimal digits to show as fractions in the heatmap.

        Returns
        -------
        pd.io.formats.style.Styler
            Styled pd.Dataframe object.

        """
        called_params = {
            "metrics": metrics,
            "threshold": threshold,
            "sorting": sorting,
            "heatmap_cols": heatmap_cols,
            "sample_size": sample_size,
            "precision": precision,
        }

        not_hash_values = ["metrics"]

        plot_sequence_df = self._sequence_df.copy()
        heatmap_columns_list, sorting_checked, threshold_checked, final_columns_level_0 = self._check_params(
            plot_sequence_df, metrics, threshold, sorting, heatmap_cols
        )

        sorting_columns, sorting_direction = sorting_checked
        threshold_column, threshold_value = threshold_checked
        plot_sequence_df = self._size_sample_ids(
            data=plot_sequence_df, sample_size=sample_size, final_columns_level_0=final_columns_level_0
        )

        plot_sequence_df = plot_sequence_df.sort_values(by=sorting_columns, ascending=sorting_direction)
        if threshold:
            plot_sequence_df = plot_sequence_df[plot_sequence_df[threshold_column] >= abs(threshold_value)]

        if precision:
            plot_sequence_df = plot_sequence_df.round(precision)

        plot_sequence_df_styler = self._create_plot(
            data=plot_sequence_df, heatmap_columns=heatmap_columns_list, precision=precision
        )

        self._show_values_df = plot_sequence_df
        collect_data_performance(
            scope="sequences",
            event_name="metadata",
            called_params=called_params,
            not_hash_values=not_hash_values,
            performance_data={"shape": plot_sequence_df.shape},
            eventstream_index=self.__eventstream._eventstream_index,
        )

        return plot_sequence_df_styler

    @property
    @time_performance(
        scope="sequences",
        event_name="values",
    )
    def values(self) -> pd.DataFrame:
        """
        Returns a pd.DataFrame representing the fitted or plotted Sequences table.
        Should be used after :py:func:`fit` or :py:func:`plot`.


        Returns
        -------
        pd.DataFrame



        """
        return self._show_values_df

    @property
    @time_performance(
        scope="sequences",
        event_name="params",
    )
    def params(self) -> dict[str, tuple | None | str]:
        """
        Returns the parameters used for the last fitting.
        Should be used after :py:func:`fit`.

        """
        return {
            "ngram_range": self.ngram_range,
            "groups": self.groups,
            "group_names": self.group_names,
            "weight_col": self.weight_col,
        }