From f84bbcf5870bd1fb51cb7904faee09c58694a11c Mon Sep 17 00:00:00 2001 From: Ijka Date: Fri, 26 Jul 2024 09:54:44 +0000 Subject: [PATCH 1/8] correlation coefficients are changed to spearman. Check for both original and synthetic data correlation coefficients to be both NaNs is added --- .../ml/metrics/metrics_classes/metrics.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/syngen/ml/metrics/metrics_classes/metrics.py b/src/syngen/ml/metrics/metrics_classes/metrics.py index a412fe27..832f207e 100644 --- a/src/syngen/ml/metrics/metrics_classes/metrics.py +++ b/src/syngen/ml/metrics/metrics_classes/metrics.py @@ -269,7 +269,23 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]): ) ) self.corr_score = self.original_heatmap - self.synthetic_heatmap - self.corr_score = self.corr_score.dropna(how="all").dropna(how="all", axis=1) + self.corr_score = ( + self.corr_score + .dropna(how="all") + .dropna(how="all", axis=1) + ) + + # check if there are any nans left in corr_score + if self.corr_score.isna().values.any(): + # mask for NaNs in both original_heatmap and synthetic_heatmap + nan_mask = ( + np.isnan(self.original_heatmap) & + np.isnan(self.synthetic_heatmap) + ) + + # Set the NaN values in corr_score to 0 where both + # original_heatmap and synthetic_heatmap have NaNs + self.corr_score[nan_mask] = 0 if self.plot: plt.clf() @@ -294,7 +310,7 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]): @staticmethod def __calculate_correlations(data): - return abs(data.corr()) + return abs(data.corr(method="spearman")) class BivariateMetric(BaseMetric): From 3776d12409389c905d83fb0db56bdb835f78ad99 Mon Sep 17 00:00:00 2001 From: Ijka Date: Fri, 26 Jul 2024 12:51:40 +0000 Subject: [PATCH 2/8] VERSION updated --- src/syngen/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/syngen/VERSION b/src/syngen/VERSION index f7522682..a8b7b062 100644 --- 
a/src/syngen/VERSION +++ b/src/syngen/VERSION @@ -1 +1 @@ -0.9.19 +0.9.24rc0 From 379c256fa2dfdb304648907aed7e51628bdb13ff Mon Sep 17 00:00:00 2001 From: Ijka Date: Fri, 26 Jul 2024 15:05:13 +0000 Subject: [PATCH 3/8] VERSION updated --- src/syngen/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/syngen/VERSION b/src/syngen/VERSION index a8b7b062..c2aeab43 100644 --- a/src/syngen/VERSION +++ b/src/syngen/VERSION @@ -1 +1 @@ -0.9.24rc0 +0.9.24rc1 From 6ec7dae34fc5446794bd81703e49629b52db638c Mon Sep 17 00:00:00 2001 From: Ijka Date: Mon, 29 Jul 2024 14:03:59 +0000 Subject: [PATCH 4/8] color for NaN values in correlation heatmap is added --- src/syngen/ml/metrics/metrics_classes/metrics.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/syngen/ml/metrics/metrics_classes/metrics.py b/src/syngen/ml/metrics/metrics_classes/metrics.py index 832f207e..19eefaf3 100644 --- a/src/syngen/ml/metrics/metrics_classes/metrics.py +++ b/src/syngen/ml/metrics/metrics_classes/metrics.py @@ -289,6 +289,11 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]): if self.plot: plt.clf() + # set color for NaN values + nan_mask = self.corr_score.isna() + # Color for NaNs + self.cmap.set_bad('gray') + sns.set(rc={"figure.figsize": self.corr_score.shape}, font_scale=2) heatmap = sns.heatmap( self.corr_score, @@ -298,6 +303,7 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]): vmax=1.0, center=0.5, square=True, + mask=nan_mask ) heatmap.figure.tight_layout() From 394176d27e221234834c925e61867d29867f561e Mon Sep 17 00:00:00 2001 From: Ijka Date: Mon, 29 Jul 2024 16:51:45 +0000 Subject: [PATCH 5/8] Info added to accuracy report in case of NaN values left in correlation_matrix. 
nan mask is commented for testing --- .../accuracy_test/accuracy_report.html | 6 ++++- .../ml/metrics/metrics_classes/metrics.py | 26 +++++++++---------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/syngen/ml/metrics/accuracy_test/accuracy_report.html b/src/syngen/ml/metrics/accuracy_test/accuracy_report.html index cc23cdea..1361756b 100644 --- a/src/syngen/ml/metrics/accuracy_test/accuracy_report.html +++ b/src/syngen/ml/metrics/accuracy_test/accuracy_report.html @@ -514,7 +514,11 @@

Content

operation is performed for the synthetic dataset. The resulting matrix is the difference between original correlation - matrix and the synthetic one.

+ matrix and the synthetic one. + {% if correlation_median != correlation_median %} + NaN values in the resulting matrix are represented with gray color. + {% endif %}

+ diff --git a/src/syngen/ml/metrics/metrics_classes/metrics.py b/src/syngen/ml/metrics/metrics_classes/metrics.py index 19eefaf3..7adc6688 100644 --- a/src/syngen/ml/metrics/metrics_classes/metrics.py +++ b/src/syngen/ml/metrics/metrics_classes/metrics.py @@ -275,23 +275,23 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]): .dropna(how="all", axis=1) ) - # check if there are any nans left in corr_score - if self.corr_score.isna().values.any(): - # mask for NaNs in both original_heatmap and synthetic_heatmap - nan_mask = ( - np.isnan(self.original_heatmap) & - np.isnan(self.synthetic_heatmap) - ) - - # Set the NaN values in corr_score to 0 where both - # original_heatmap and synthetic_heatmap have NaNs - self.corr_score[nan_mask] = 0 + # # check if there are any nans left in corr_score + # if self.corr_score.isna().values.any(): + # # mask for NaNs in both original_heatmap and synthetic_heatmap + # nan_mask = ( + # np.isnan(self.original_heatmap) & + # np.isnan(self.synthetic_heatmap) + # ) + + # # Set the NaN values in corr_score to 0 where both + # # original_heatmap and synthetic_heatmap have NaNs + # self.corr_score[nan_mask] = 0 if self.plot: plt.clf() - # set color for NaN values + # set mask for NaN values nan_mask = self.corr_score.isna() - # Color for NaNs + # Color for NaN values self.cmap.set_bad('gray') sns.set(rc={"figure.figsize": self.corr_score.shape}, font_scale=2) From 4c09a3ceabd1b29ba73e32908545bf71a0e0c99b Mon Sep 17 00:00:00 2001 From: Ijka Date: Wed, 31 Jul 2024 08:07:43 +0000 Subject: [PATCH 6/8] annotations for NaN values are added. 
Description text in report is updated --- .../accuracy_test/accuracy_report.html | 14 +--- .../ml/metrics/metrics_classes/metrics.py | 79 +++++++++++-------- 2 files changed, 48 insertions(+), 45 deletions(-) diff --git a/src/syngen/ml/metrics/accuracy_test/accuracy_report.html b/src/syngen/ml/metrics/accuracy_test/accuracy_report.html index 1361756b..9df46461 100644 --- a/src/syngen/ml/metrics/accuracy_test/accuracy_report.html +++ b/src/syngen/ml/metrics/accuracy_test/accuracy_report.html @@ -505,18 +505,10 @@

Content

BAD
-

The correlation metric is calculated in the following manner: for each - pair - of - numeric columns in - original - dataset a pairwise correlations is measured thus resulting in the correlations matrix. The same - operation - is performed for the synthetic dataset. The resulting matrix is the difference between original - correlation - matrix and the synthetic one. +

The correlation metric is calculated in the following manner: for each pair of numeric and categorical columns in the original dataset, a pairwise correlation is measured, thus resulting in the correlation matrix. The same operation is performed for the synthetic dataset. The resulting matrix is the difference between the original correlation matrix and the synthetic one. + {% if correlation_median != correlation_median %} - NaN values in the resulting matrix are represented with gray color. + NaN values, shown in gray, indicate that correlations could be computed in one dataset but not the other. This suggests significant differences in data characteristics between original and synthetic datasets. {% endif %}

Date: Wed, 31 Jul 2024 09:18:06 +0000 Subject: [PATCH 7/8] VERSION updated --- src/syngen/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/syngen/VERSION b/src/syngen/VERSION index c2aeab43..52c7246e 100644 --- a/src/syngen/VERSION +++ b/src/syngen/VERSION @@ -1 +1 @@ -0.9.24rc1 +0.9.24rc2 From a62a8c9b89e6c2aaa8439ec562a2d3143dd05167 Mon Sep 17 00:00:00 2001 From: Ijka Date: Thu, 22 Aug 2024 07:06:00 +0000 Subject: [PATCH 8/8] VERSION updated --- src/syngen/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/syngen/VERSION b/src/syngen/VERSION index 6a04595c..051fc56b 100644 --- a/src/syngen/VERSION +++ b/src/syngen/VERSION @@ -1 +1 @@ -0.9.24rc3 +0.9.28