Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Epmctdm 7086 clustering metric #455

Merged
merged 16 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/syngen/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.9.33
0.9.35rc0
6 changes: 2 additions & 4 deletions src/syngen/ml/metrics/accuracy_test/accuracy_report.html
Original file line number Diff line number Diff line change
Expand Up @@ -525,16 +525,14 @@ <h2 class="h2-title">Content</h2>
clustering
using
K-means is performed on the concatenated dataset. The optimal number of clusters is chosen using
the
elbow
rule. For the good synthetic data the proportion of original to synthetic records in each cluster
the silhouette score. For the good synthetic data the proportion of original to synthetic records in each cluster
should be close to 1:1. The mean clusters homogeneity is calculated as a mean of ratios of
original
to
synthetic
records in each cluster.</p>
<p class="accuracy-report-value"
style="margin-bottom:36px">Median clusters homogeneity: <span>{{clustering_value}}</span> </p>
style="margin-bottom:36px">Mean clusters homogeneity: <span>{{clustering_value}}</span> </p>
<img style="width: 100%;"
src="{{clusters_barplot}}">

Expand Down
2 changes: 1 addition & 1 deletion src/syngen/ml/metrics/accuracy_test/accuracy_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def _fetch_metrics(self, **kwargs):
clustering_result = round(
self.clustering.calculate_all(kwargs["categ_columns"], kwargs["cont_columns"]), 4
)
logger.info(f"Median clusters homogeneity is {clustering_result}")
logger.info(f"Mean clusters homogeneity is {clustering_result}")
self.update_progress_bar("The clustering metric has been calculated", delta)

self.update_progress_bar("Generation of the utility metric...")
Expand Down
49 changes: 31 additions & 18 deletions src/syngen/ml/metrics/metrics_classes/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
Expand Down Expand Up @@ -912,14 +913,22 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]):
logger.warning("No clustering metric will be formed due to empty DataFrame")
return None
self.__preprocess_data()
optimal_clust_num = self.__automated_elbow()
optimal_clust_num = self.__automated_silhouette()
statistics = self.__calculate_clusters(optimal_clust_num)
statistics.columns = ["cluster", "dataset", "count"]

def diversity(x):
Ijka marked this conversation as resolved.
Show resolved Hide resolved
return min(x) / max(x)
"""
Calculate the diversity score for each cluster.
If only one dataset is present in a cluster, return 0.
"""
if x['dataset'].nunique() == 2:
return min(x['count']) / max(x['count'])
else:
return 0

statistics = self.__calculate_clusters(optimal_clust_num)
statistics.columns = ["cluster", "dataset", "count"]
self.mean_score = statistics.groupby("cluster").agg({"count": diversity}).mean()
diversity_scores = statistics.groupby('cluster').apply(diversity)
self.mean_score = diversity_scores.mean()
Ijka marked this conversation as resolved.
Show resolved Hide resolved

if self.plot:
plt.clf()
Expand Down Expand Up @@ -950,26 +959,29 @@ def diversity(x):
bbox_inches="tight",
format="svg",
)
return self.mean_score.values[0]
return self.mean_score

def __automated_silhouette(self):
    """
    Choose the optimal number of K-means clusters via the silhouette score.

    Fits K-means for each candidate cluster count from 2 up to (but not
    including) ``min(10, n_samples)``, scores each labeling with
    ``silhouette_score``, and returns the cluster count with the highest
    score.

    Returns:
        The candidate cluster count whose K-means labeling maximizes the
        silhouette score.
    """
    result_table = {"cluster_num": [], "silhouette_score": []}
    # NOTE(review): range(2, max_clusters) excludes max_clusters itself, and
    # for very small inputs (len(self.merged_transformed) <= 3) the loop body
    # never runs, making idxmax() below fail on an empty table — confirm the
    # callers guarantee enough rows.
    max_clusters = min(10, len(self.merged_transformed))

    for i in range(2, max_clusters):
        clusters = KMeans(n_clusters=i, random_state=10).fit(
            self.merged_transformed
        )
        labels = clusters.labels_
        score = silhouette_score(self.merged_transformed, labels)
        result_table["cluster_num"].append(i)
        result_table["silhouette_score"].append(score)

    result_table = pd.DataFrame(result_table)
    # Higher silhouette score means better-separated, more cohesive clusters,
    # so pick the cluster count at the score's maximum.
    optimal_clusters = result_table.loc[
        result_table["silhouette_score"].idxmax(), "cluster_num"
    ]
    return optimal_clusters

def __preprocess_data(self):
self.merged_transformed = self.merged.apply(
def __calculate_clusters(self, n):
    """
    Run K-means with ``n`` clusters on the preprocessed merged dataset.

    Labels every row of ``self.merged_transformed`` with its cluster, then
    counts rows per (cluster, origin) pair, where ``origin`` is taken from
    the merged frame's ``level_0`` column (presumably the original-vs-
    synthetic dataset marker — confirm against the caller).

    Args:
        n: number of clusters to fit.

    Returns:
        A DataFrame with columns ``cluster``, ``origin`` and an unnamed
        count column (from ``size().reset_index()``), one row per
        (cluster, origin) pair.
    """
    clusters = KMeans(n_clusters=n, random_state=10).fit(self.merged_transformed)
    labels = clusters.labels_
    rows_labels = pd.DataFrame({"origin": self.merged["level_0"], "cluster": labels})

    return rows_labels.groupby(["cluster", "origin"]).size().reset_index()


Expand Down