Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Epmctdm 7086 clustering metric #455

Merged
merged 16 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ The tool is based on the variational autoencoder model (VAE). The Bayesian Gauss

## Prerequisites

Python 3.9 or Python 3.10 is required to run the library. The library is tested on Linux and Windows operating systems.
Python 3.10 is required to run the library. The library is tested on Linux and Windows operating systems.
You can download Python from [the official website](https://www.python.org/downloads/) and install manually, or you can install Python [from your terminal](https://docs.python-guide.org/starting/installation/). After the installation of Python, please, check whether [pip is installed](https://pip.pypa.io/en/stable/getting-started/).

## Getting started
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ reportportal-client
scikit_learn==1.5.*
scipy==1.14.*
seaborn==0.13.*
setuptools>=70.0.0
setuptools==74.1.*
tensorflow==2.15.*
tqdm==4.66.3
Werkzeug==3.0.3
Expand Down
5 changes: 2 additions & 3 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ classifiers =
Operating System :: POSIX :: Linux
Operating System :: Microsoft :: Windows
License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10


Expand All @@ -23,7 +22,7 @@ package_dir =
= src
packages = find:
include_package_data = True
python_requires = >3.8, <3.11
python_requires = >3.9, <3.11
install_requires =
aiohttp>=3.9.0
attrs
Expand Down Expand Up @@ -56,7 +55,7 @@ install_requires =
scikit_learn==1.5.*
scipy==1.14.*
seaborn==0.13.*
setuptools>=70.0.0
setuptools==74.1.*
tensorflow==2.15.*
tqdm==4.66.3
Werkzeug==3.0.3
Expand Down
2 changes: 1 addition & 1 deletion src/syngen/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.9.35
0.9.36
6 changes: 2 additions & 4 deletions src/syngen/ml/metrics/accuracy_test/accuracy_report.html
Original file line number Diff line number Diff line change
Expand Up @@ -525,16 +525,14 @@ <h2 class="h2-title">Content</h2>
clustering
using
K-means is performed on the concatenated dataset. The optimal number of clusters is chosen using
the
elbow
rule. For the good synthetic data the proportion of original to synthetic records in each cluster
the silhouette score. For good synthetic data, the proportion of original to synthetic records in each cluster
should be close to 1:1. The mean clusters homogeneity is calculated as a mean of ratios of
original
to
synthetic
records in each cluster.</p>
<p class="accuracy-report-value"
style="margin-bottom:36px">Median clusters homogeneity: <span>{{clustering_value}}</span> </p>
style="margin-bottom:36px">Mean clusters homogeneity: <span>{{clustering_value}}</span> </p>
<img style="width: 100%;"
src="{{clusters_barplot}}">

Expand Down
2 changes: 1 addition & 1 deletion src/syngen/ml/metrics/accuracy_test/accuracy_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def _fetch_metrics(self, **kwargs):
clustering_result = round(
self.clustering.calculate_all(kwargs["categ_columns"], kwargs["cont_columns"]), 4
)
logger.info(f"Median clusters homogeneity is {clustering_result}")
logger.info(f"Mean clusters homogeneity is {clustering_result}")
self.update_progress_bar("The clustering metric has been calculated", delta)

self.update_progress_bar("Generation of the utility metric...")
Expand Down
61 changes: 37 additions & 24 deletions src/syngen/ml/metrics/metrics_classes/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
Expand Down Expand Up @@ -909,17 +910,17 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]):
.reset_index()
)
if len(self.merged) == 0:
logger.warning("No clustering metric will be formed due to empty DataFrame")
logger.warning(
"No clustering metric will be formed due to empty DataFrame"
)
return None
self.__preprocess_data()
optimal_clust_num = self.__automated_elbow()

def diversity(x):
return min(x) / max(x)

optimal_clust_num = self.__automated_silhouette()
statistics = self.__calculate_clusters(optimal_clust_num)
statistics.columns = ["cluster", "dataset", "count"]
self.mean_score = statistics.groupby("cluster").agg({"count": diversity}).mean()

diversity_scores = statistics.groupby('cluster').apply(self.diversity)
mean_score = diversity_scores.mean()

if self.plot:
plt.clf()
Expand Down Expand Up @@ -950,26 +951,37 @@ def diversity(x):
bbox_inches="tight",
format="svg",
)
return self.mean_score.values[0]
return mean_score

def __automated_elbow(self):
result_table = {"cluster_num": [], "metric": []}
@staticmethod
def diversity(statistics):
"""
Calculate the diversity score for each cluster
from collected statistics.
If in cluster only one dataset is present, return 0.
"""
if statistics['dataset'].nunique() == 2:
return min(statistics['count']) / max(statistics['count'])
else:
return 0

def __automated_silhouette(self):
    """
    Pick the K-means cluster count that maximizes the silhouette score.

    Candidate cluster counts run from 2 up to (but excluding)
    ``min(10, n_samples)``; the exclusive upper bound also keeps every
    candidate k <= n_samples - 1, as silhouette_score requires.

    Returns:
        int: the cluster count with the highest silhouette score, or 2
        as a fallback when the dataset is too small (fewer than 3 rows)
        for any candidate to be evaluated.
    """
    silhouette_scores = []
    max_clusters = min(10, len(self.merged_transformed))

    for i in range(2, max_clusters):
        clusters = KMeans(n_clusters=i, random_state=10).fit(
            self.merged_transformed
        )
        labels = clusters.labels_
        score = silhouette_score(self.merged_transformed, labels)
        silhouette_scores.append(score)

    if not silhouette_scores:
        # Fewer than 3 samples: the candidate range above is empty and
        # np.argmax would raise on an empty sequence. Fall back to the
        # minimum sensible cluster count instead of crashing.
        return 2

    # Get number of clusters with the highest silhouette score
    # +2 because the range starts from 2
    optimal_clusters = int(np.argmax(silhouette_scores)) + 2

    return optimal_clusters

def __preprocess_data(self):
self.merged_transformed = self.merged.apply(
def __calculate_clusters(self, n):
    """
    Cluster the transformed merged data into ``n`` K-means clusters and
    count how many records of each origin landed in each cluster.

    Returns:
        pd.DataFrame: one row per (cluster, origin) pair with a size
        column holding the record count for that pair.
    """
    fitted = KMeans(n_clusters=n, random_state=10).fit(self.merged_transformed)
    # Pair every record's origin ("level_0" marks original vs synthetic
    # after the merge) with its assigned cluster label.
    assignment = pd.DataFrame(
        {"origin": self.merged["level_0"], "cluster": fitted.labels_}
    )
    return assignment.groupby(["cluster", "origin"]).size().reset_index()


Expand Down