Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Epmctdm 7086 clustering metric #455

Merged
merged 16 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ The tool is based on the variational autoencoder model (VAE). The Bayesian Gauss

## Prerequisites

Python 3.9 or Python 3.10 is required to run the library. The library is tested on Linux and Windows operating systems.
Python 3.10 is required to run the library. The library is tested on Linux and Windows operating systems.
You can download Python from [the official website](https://www.python.org/downloads/) and install manually, or you can install Python [from your terminal](https://docs.python-guide.org/starting/installation/). After the installation of Python, please, check whether [pip is installed](https://pip.pypa.io/en/stable/getting-started/).

## Getting started
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ reportportal-client
scikit_learn==1.5.*
scipy==1.14.*
seaborn==0.13.*
setuptools>=70.0.0
setuptools==74.1.*
tensorflow==2.15.*
tqdm==4.66.3
Werkzeug==3.0.3
Expand Down
5 changes: 2 additions & 3 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ classifiers =
Operating System :: POSIX :: Linux
Operating System :: Microsoft :: Windows
License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10


Expand All @@ -23,7 +22,7 @@ package_dir =
= src
packages = find:
include_package_data = True
python_requires = >3.8, <3.11
python_requires = >3.9, <3.11
install_requires =
aiohttp>=3.9.0
attrs
Expand Down Expand Up @@ -56,7 +55,7 @@ install_requires =
scikit_learn==1.5.*
scipy==1.14.*
seaborn==0.13.*
setuptools>=70.0.0
setuptools==74.1.*
tensorflow==2.15.*
tqdm==4.66.3
Werkzeug==3.0.3
Expand Down
2 changes: 1 addition & 1 deletion src/syngen/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.9.35
0.9.36
6 changes: 2 additions & 4 deletions src/syngen/ml/metrics/accuracy_test/accuracy_report.html
Original file line number Diff line number Diff line change
Expand Up @@ -525,16 +525,14 @@ <h2 class="h2-title">Content</h2>
clustering
using
K-means is performed on the concatenated dataset. The optimal number of clusters is chosen using
the
elbow
rule. For the good synthetic data the proportion of original to synthetic records in each cluster
the silhouette score. For good synthetic data, the proportion of original to synthetic records in each cluster
should be close to 1:1. The mean clusters homogeneity is calculated as a mean of ratios of
original
to
synthetic
records in each cluster.</p>
<p class="accuracy-report-value"
style="margin-bottom:36px">Median clusters homogeneity: <span>{{clustering_value}}</span> </p>
style="margin-bottom:36px">Mean clusters homogeneity: <span>{{clustering_value}}</span> </p>
<img style="width: 100%;"
src="{{clusters_barplot}}">

Expand Down
2 changes: 1 addition & 1 deletion src/syngen/ml/metrics/accuracy_test/accuracy_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def _fetch_metrics(self, **kwargs):
clustering_result = round(
self.clustering.calculate_all(kwargs["categ_columns"], kwargs["cont_columns"]), 4
)
logger.info(f"Median clusters homogeneity is {clustering_result}")
logger.info(f"Mean clusters homogeneity is {clustering_result}")
self.update_progress_bar("The clustering metric has been calculated", delta)

self.update_progress_bar("Generation of the utility metric...")
Expand Down
61 changes: 37 additions & 24 deletions src/syngen/ml/metrics/metrics_classes/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
Expand Down Expand Up @@ -909,17 +910,17 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]):
.reset_index()
)
if len(self.merged) == 0:
logger.warning("No clustering metric will be formed due to empty DataFrame")
logger.warning(
"No clustering metric will be formed due to empty DataFrame"
)
return None
self.__preprocess_data()
optimal_clust_num = self.__automated_elbow()

def diversity(x):
return min(x) / max(x)

optimal_clust_num = self.__automated_silhouette()
statistics = self.__calculate_clusters(optimal_clust_num)
statistics.columns = ["cluster", "dataset", "count"]
self.mean_score = statistics.groupby("cluster").agg({"count": diversity}).mean()

diversity_scores = statistics.groupby('cluster').apply(self.diversity)
mean_score = diversity_scores.mean()

if self.plot:
plt.clf()
Expand Down Expand Up @@ -950,26 +951,37 @@ def diversity(x):
bbox_inches="tight",
format="svg",
)
return self.mean_score.values[0]
return mean_score

def __automated_elbow(self):
result_table = {"cluster_num": [], "metric": []}
@staticmethod
def diversity(statistics):
"""
Calculate the diversity score for each cluster
from collected statistics.
If in cluster only one dataset is present, return 0.
"""
if statistics['dataset'].nunique() == 2:
return min(statistics['count']) / max(statistics['count'])
else:
return 0

def __automated_silhouette(self):
    """
    Pick the K-means cluster count that maximizes the silhouette score.

    Candidate cluster counts run from 2 up to (but excluding)
    ``min(10, n_samples)``; the exclusive upper bound also keeps every
    candidate k <= n_samples - 1, as silhouette_score requires.

    Returns:
        int: the cluster count with the highest silhouette score, or 2
        as a fallback when the dataset is too small (fewer than 3 rows)
        for any candidate to be evaluated.
    """
    silhouette_scores = []
    max_clusters = min(10, len(self.merged_transformed))

    for i in range(2, max_clusters):
        clusters = KMeans(n_clusters=i, random_state=10).fit(
            self.merged_transformed
        )
        labels = clusters.labels_
        score = silhouette_score(self.merged_transformed, labels)
        silhouette_scores.append(score)

    if not silhouette_scores:
        # Fewer than 3 samples: the candidate range above is empty and
        # np.argmax would raise on an empty sequence. Fall back to the
        # minimum sensible cluster count instead of crashing.
        return 2

    # Get number of clusters with the highest silhouette score
    # +2 because the range starts from 2
    optimal_clusters = int(np.argmax(silhouette_scores)) + 2

    return optimal_clusters

def __preprocess_data(self):
self.merged_transformed = self.merged.apply(
def __calculate_clusters(self, n):
    """
    Cluster the transformed merged data into ``n`` K-means clusters and
    count how many records of each origin landed in each cluster.

    Returns:
        pd.DataFrame: one row per (cluster, origin) pair with a size
        column holding the record count for that pair.
    """
    fitted = KMeans(n_clusters=n, random_state=10).fit(self.merged_transformed)
    # Pair every record's origin ("level_0" marks original vs synthetic
    # after the merge) with its assigned cluster label.
    assignment = pd.DataFrame(
        {"origin": self.merged["level_0"], "cluster": fitted.labels_}
    )
    return assignment.groupby(["cluster", "origin"]).size().reset_index()


Expand Down