Skip to content
This repository has been archived by the owner on Nov 10, 2023. It is now read-only.

Item to Item func #14

Merged
merged 10 commits into from
Dec 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"mktestdocs>=0.1.0",
"interrogate>=1.2.0",
"pre-commit>=2.15.0",
"pyarrow>=6.0.1",
] + all_dep_packages

docs_packages = [
Expand Down
15 changes: 15 additions & 0 deletions tests/test_error.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pytest
from valves import item_item_counts, sessionize, bayes_average


@pytest.mark.parametrize("func", [item_item_counts, sessionize])
def test_dataf_error_occurs(func):
    """These functions don't have required arguments."""
    # A non-dataframe input (here an int) must raise ValueError from the
    # dispatcher's fall-through error path.
    with pytest.raises(ValueError):
        func(1)


def test_dataf_error_bayes_average():
    """bayes_average has required arguments."""
    # Even when the required keyword arguments are supplied, a non-dataframe
    # first argument must still raise ValueError from the dispatcher.
    with pytest.raises(ValueError):
        bayes_average(1, group_cols="a", target_col="b", C=10)
60 changes: 60 additions & 0 deletions tests/test_item_item_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import pandas as pd
import polars as pl
import dask.dataframe as dd

from valves.polars import item_item_counts as ii_pl
from valves.pandas import item_item_counts as ii_pd
from valves.dask import item_item_counts as ii_dd
from valves import item_item_counts

# Two users with overlapping item interactions: user 1 saw items {1, 2, 3}
# and user 2 saw items {2, 3, 4}, so items 2 and 3 are shared by both users.
going_in = [
    {"user": 1, "item": 1},
    {"user": 1, "item": 2},
    {"user": 1, "item": 3},
    {"user": 2, "item": 2},
    {"user": 2, "item": 3},
    {"user": 2, "item": 4},
]

# Expected output for `going_in`: one row per ordered pair of distinct items
# that share at least one user. `n_item`/`n_item_rec` are the unique-user
# counts of each item in the pair and `n_both` the number of users who
# interacted with both items.
going_out = [
    {"item": 1, "item_rec": 2, "n_item": 1, "n_item_rec": 2, "n_both": 1},
    {"item": 1, "item_rec": 3, "n_item": 1, "n_item_rec": 2, "n_both": 1},
    {"item": 2, "item_rec": 1, "n_item": 2, "n_item_rec": 1, "n_both": 1},
    {"item": 2, "item_rec": 3, "n_item": 2, "n_item_rec": 2, "n_both": 2},
    {"item": 3, "item_rec": 1, "n_item": 2, "n_item_rec": 1, "n_both": 1},
    {"item": 3, "item_rec": 2, "n_item": 2, "n_item_rec": 2, "n_both": 2},
    {"item": 2, "item_rec": 4, "n_item": 2, "n_item_rec": 1, "n_both": 1},
    {"item": 3, "item_rec": 4, "n_item": 2, "n_item_rec": 1, "n_both": 1},
    {"item": 4, "item_rec": 2, "n_item": 1, "n_item_rec": 2, "n_both": 1},
    {"item": 4, "item_rec": 3, "n_item": 1, "n_item_rec": 2, "n_both": 1},
]


def test_pandas_item_item_counts():
    """The pandas backend and the generic dispatcher agree on a simple dataset."""
    for func in (ii_pd, item_item_counts):
        records = func(pd.DataFrame(going_in)).to_dict(orient="records")
        assert records == going_out


def test_dask_item_item_counts():
    """The dask backend and the generic dispatcher agree on a simple dataset."""
    for func in (ii_dd, item_item_counts):
        dask_df = dd.from_pandas(pd.DataFrame(going_in), npartitions=1)
        records = func(dask_df).compute().to_dict(orient="records")
        assert records == going_out


def test_polars_item_item_counts():
    """The polars backend and the generic dispatcher agree on a simple dataset."""
    for func in (ii_pl, item_item_counts):
        frame = func(pl.DataFrame(going_in))
        assert frame.to_pandas().to_dict(orient="records") == going_out
25 changes: 24 additions & 1 deletion valves/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
else:
from .polars import sessionize as sess_pl
from .polars import bayes_average as bayes_average_pl
from .polars import item_item_counts as ii_pl

try:
import pandas as pd
Expand All @@ -17,6 +18,7 @@
else:
from .pandas import sessionize as sess_pd
from .pandas import bayes_average as bayes_average_pd
from .pandas import item_item_counts as ii_pd

try:
import dask.dataframe as dd
Expand All @@ -27,6 +29,7 @@
else:
from .dask import sessionize as sess_dd
from .dask import bayes_average as bayes_average_dd
from .dask import item_item_counts as ii_dd


def _raise_dataf_error(dataf):
Expand Down Expand Up @@ -79,7 +82,7 @@ def bayes_average(
This function is meant to be used in a `.pipe()`-line.

Arguments:
- dataf: dask dataframe
- dataf: pandas, polars or dask dataframe
- group_cols: list of columns to group by
- target_col: name of the column containing the target value, typically a rating
- C: smoothing parameter
Expand All @@ -93,3 +96,23 @@ def bayes_average(
if _POLARS_AVAILABLE and isinstance(dataf, pl.DataFrame):
return bayes_average_pl(dataf, group_cols, target_col, C, prior_mean, out_col)
_raise_dataf_error(dataf)


def item_item_counts(dataf, user_col="user", item_col="item"):
    """
    Computes item-item overlap counts from user-item interactions, useful for recommendations.

    Dispatches to the dask, pandas or polars implementation depending on the
    concrete type of `dataf`. Meant to be used in a `.pipe()`-line.

    Arguments:
    - dataf: pandas, polars or dask dataframe
    - user_col: name of the column containing the user id
    - item_col: name of the column containing the item id
    """
    # Each isinstance-check is short-circuited by its availability flag, so a
    # backend module that failed to import is never referenced.
    if _DASK_AVAILABLE and isinstance(dataf, dd.DataFrame):
        return ii_dd(dataf, user_col=user_col, item_col=item_col)
    if _PANDAS_AVAILABLE and isinstance(dataf, pd.DataFrame):
        return ii_pd(dataf, user_col=user_col, item_col=item_col)
    if _POLARS_AVAILABLE and isinstance(dataf, pl.DataFrame):
        return ii_pl(dataf, user_col=user_col, item_col=item_col)
    _raise_dataf_error(dataf)
43 changes: 43 additions & 0 deletions valves/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,46 @@ def bayes_average(
.drop(columns=["sum", "size"])
)
return dataf.join(to_join_back, on=group_cols).reset_index()


def item_item_counts(dataf, user_col="user", item_col="item"):
    """
    Computes item-item overlap counts from user-item interactions, useful for recommendations.

    For every ordered pair of distinct items that share at least one user, the
    output contains one row with `n_{item_col}` (unique users of the first
    item), `n_{item_col}_rec` (unique users of the second item) and `n_both`
    (unique users who interacted with both). Note that the counts are computed
    after self-pairs are removed, so a user who interacted with only a single
    item does not contribute to any count.

    This function is meant to be used in a `.pipe()`-line.

    Arguments:
    - dataf: dask dataframe
    - user_col: name of the column containing the user id
    - item_col: name of the column containing the item id
    """
    # Bug fix: select/join on the configured column names instead of the
    # hard-coded literals "user"/"item", so non-default user_col/item_col work.
    user_items = dataf[[user_col, item_col]].drop_duplicates()
    return (
        # Self-merge on the user gives every (item, item) pair per user; the
        # merge suffixes _x/_y land on the item column only.
        user_items.merge(user_items, how="left", on=user_col)
        .rename(columns={f"{item_col}_x": item_col, f"{item_col}_y": f"{item_col}_rec"})
        # Drop the trivial (item, item) self-pairs.
        .loc[lambda d: d[item_col] != d[f"{item_col}_rec"]]
        .assign(
            **{
                "n_both": lambda s: s.groupby([item_col, f"{item_col}_rec"])[
                    user_col
                ].transform(lambda d: d.nunique()),
                f"n_{item_col}_rec": lambda s: s.groupby([f"{item_col}_rec"])[
                    user_col
                ].transform(lambda d: d.nunique()),
                f"n_{item_col}": lambda s: s.groupby([item_col])[user_col].transform(
                    lambda d: d.nunique()
                ),
            }
        )
        # The user column is no longer needed; deduplicate to one row per pair.
        .drop(columns=[user_col])
        .drop_duplicates()
        .reset_index(drop=True)[
            [
                f"{item_col}",
                f"{item_col}_rec",
                f"n_{item_col}",
                f"n_{item_col}_rec",
                "n_both",
            ]
        ]
    )
43 changes: 43 additions & 0 deletions valves/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,46 @@ def bayes_average(
.drop(columns=["_s", "_c"])
)
return dataf.join(to_join_back, on=group_cols).reset_index()


def item_item_counts(dataf, user_col="user", item_col="item"):
    """
    Computes item-item overlap counts from user-item interactions, useful for recommendations.

    For every ordered pair of distinct items that share at least one user, the
    output contains one row with `n_{item_col}` (unique users of the first
    item), `n_{item_col}_rec` (unique users of the second item) and `n_both`
    (unique users who interacted with both). Note that the counts are computed
    after self-pairs are removed, so a user who interacted with only a single
    item does not contribute to any count.

    This function is meant to be used in a `.pipe()`-line.

    Arguments:
    - dataf: pandas dataframe
    - user_col: name of the column containing the user id
    - item_col: name of the column containing the item id
    """
    # Bug fix: select/join on the configured column names instead of the
    # hard-coded literals "user"/"item", so non-default user_col/item_col work.
    user_items = dataf[[user_col, item_col]].drop_duplicates()
    return (
        # Self-merge on the user gives every (item, item) pair per user; the
        # merge suffixes _x/_y land on the item column only.
        user_items.merge(user_items, how="left", on=user_col)
        .rename(columns={f"{item_col}_x": item_col, f"{item_col}_y": f"{item_col}_rec"})
        # Drop the trivial (item, item) self-pairs.
        .loc[lambda d: d[item_col] != d[f"{item_col}_rec"]]
        .assign(
            **{
                "n_both": lambda s: s.groupby([item_col, f"{item_col}_rec"])[
                    user_col
                ].transform(lambda d: d.nunique()),
                f"n_{item_col}_rec": lambda s: s.groupby([f"{item_col}_rec"])[
                    user_col
                ].transform(lambda d: d.nunique()),
                f"n_{item_col}": lambda s: s.groupby([item_col])[user_col].transform(
                    lambda d: d.nunique()
                ),
            }
        )
        # The user column is no longer needed; deduplicate to one row per pair.
        .drop(columns=[user_col])
        .drop_duplicates()
        .reset_index(drop=True)[
            [
                f"{item_col}",
                f"{item_col}_rec",
                f"n_{item_col}",
                f"n_{item_col}_rec",
                "n_both",
            ]
        ]
    )
48 changes: 48 additions & 0 deletions valves/polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,51 @@ def bayes_average(
/ (C + pl.col(target_col).count().over(group_cols))
).alias(out_col)
)


def item_item_counts(dataf, user_col="user", item_col="item"):
    """
    Computes item-item overlap counts from user-item interactions, useful for recommendations.

    For every ordered pair of distinct items that share at least one user, the
    output contains one row with `n_{item_col}` (unique users of the first
    item), `n_{item_col}_rec` (unique users of the second item) and `n_both`
    (unique users who interacted with both).

    This function is meant to be used in a `.pipe()`-line.

    Arguments:
    - dataf: polars dataframe
    - user_col: name of the column containing the user id
    - item_col: name of the column containing the item id
    """
    # NOTE(review): `.list()` and `.drop_duplicates()` are legacy polars APIs
    # (modern polars uses `.implode()` / `.unique()`) — kept for consistency
    # with the polars version this file targets.
    return (
        dataf.with_columns(
            [
                # Bug fix: window over the configured user column instead of
                # the hard-coded literal "user", so non-default user_col works.
                pl.col(item_col).list().over(user_col).alias(f"{item_col}_rec"),
            ]
        )
        # One row per (item, candidate item) pair per user.
        .explode(f"{item_col}_rec")
        # Drop the trivial (item, item) self-pairs.
        .filter(pl.col(item_col) != pl.col(f"{item_col}_rec"))
        .with_columns(
            [
                pl.col(user_col)
                .n_unique()
                .over(pl.col(item_col))
                .alias(f"n_{item_col}"),
                pl.col(user_col)
                .n_unique()
                .over(f"{item_col}_rec")
                .alias(f"n_{item_col}_rec"),
                pl.col(user_col)
                .n_unique()
                .over([pl.col(item_col), f"{item_col}_rec"])
                .alias("n_both"),
            ]
        )
        .select(
            [
                f"{item_col}",
                f"{item_col}_rec",
                f"n_{item_col}",
                f"n_{item_col}_rec",
                "n_both",
            ]
        )
        # Deduplicate to one row per item pair.
        .drop_duplicates()
    )