From f84bbcf5870bd1fb51cb7904faee09c58694a11c Mon Sep 17 00:00:00 2001
From: Ijka
Date: Fri, 26 Jul 2024 09:54:44 +0000
Subject: [PATCH 1/8] correlation coefficients are changed to Spearman. A check
for correlation coefficients that are NaN in both the original and synthetic
data is added
---
.../ml/metrics/metrics_classes/metrics.py | 20 +++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/src/syngen/ml/metrics/metrics_classes/metrics.py b/src/syngen/ml/metrics/metrics_classes/metrics.py
index a412fe27..832f207e 100644
--- a/src/syngen/ml/metrics/metrics_classes/metrics.py
+++ b/src/syngen/ml/metrics/metrics_classes/metrics.py
@@ -269,7 +269,23 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]):
)
)
self.corr_score = self.original_heatmap - self.synthetic_heatmap
- self.corr_score = self.corr_score.dropna(how="all").dropna(how="all", axis=1)
+ self.corr_score = (
+ self.corr_score
+ .dropna(how="all")
+ .dropna(how="all", axis=1)
+ )
+
+ # check if there are any nans left in corr_score
+ if self.corr_score.isna().values.any():
+ # mask for NaNs in both original_heatmap and synthetic_heatmap
+ nan_mask = (
+ np.isnan(self.original_heatmap) &
+ np.isnan(self.synthetic_heatmap)
+ )
+
+ # Set the NaN values in corr_score to 0 where both
+ # original_heatmap and synthetic_heatmap have NaNs
+ self.corr_score[nan_mask] = 0
if self.plot:
plt.clf()
@@ -294,7 +310,7 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]):
@staticmethod
def __calculate_correlations(data):
- return abs(data.corr())
+ return abs(data.corr(method="spearman"))
class BivariateMetric(BaseMetric):
From 3776d12409389c905d83fb0db56bdb835f78ad99 Mon Sep 17 00:00:00 2001
From: Ijka
Date: Fri, 26 Jul 2024 12:51:40 +0000
Subject: [PATCH 2/8] VERSION updated
---
src/syngen/VERSION | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/syngen/VERSION b/src/syngen/VERSION
index f7522682..a8b7b062 100644
--- a/src/syngen/VERSION
+++ b/src/syngen/VERSION
@@ -1 +1 @@
-0.9.19
+0.9.24rc0
From 379c256fa2dfdb304648907aed7e51628bdb13ff Mon Sep 17 00:00:00 2001
From: Ijka
Date: Fri, 26 Jul 2024 15:05:13 +0000
Subject: [PATCH 3/8] VERSION updated
---
src/syngen/VERSION | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/syngen/VERSION b/src/syngen/VERSION
index a8b7b062..c2aeab43 100644
--- a/src/syngen/VERSION
+++ b/src/syngen/VERSION
@@ -1 +1 @@
-0.9.24rc0
+0.9.24rc1
From 6ec7dae34fc5446794bd81703e49629b52db638c Mon Sep 17 00:00:00 2001
From: Ijka
Date: Mon, 29 Jul 2024 14:03:59 +0000
Subject: [PATCH 4/8] color for NaN values in correlation heatmap is added
---
src/syngen/ml/metrics/metrics_classes/metrics.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/syngen/ml/metrics/metrics_classes/metrics.py b/src/syngen/ml/metrics/metrics_classes/metrics.py
index 832f207e..19eefaf3 100644
--- a/src/syngen/ml/metrics/metrics_classes/metrics.py
+++ b/src/syngen/ml/metrics/metrics_classes/metrics.py
@@ -289,6 +289,11 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]):
if self.plot:
plt.clf()
+ # set color for NaN values
+ nan_mask = self.corr_score.isna()
+ # Color for NaNs
+ self.cmap.set_bad('gray')
+
sns.set(rc={"figure.figsize": self.corr_score.shape}, font_scale=2)
heatmap = sns.heatmap(
self.corr_score,
@@ -298,6 +303,7 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]):
vmax=1.0,
center=0.5,
square=True,
+ mask=nan_mask
)
heatmap.figure.tight_layout()
From 394176d27e221234834c925e61867d29867f561e Mon Sep 17 00:00:00 2001
From: Ijka
Date: Mon, 29 Jul 2024 16:51:45 +0000
Subject: [PATCH 5/8] Info added to the accuracy report in case NaN values are
left in the correlation matrix. NaN mask is commented out for testing
---
.../accuracy_test/accuracy_report.html | 6 ++++-
.../ml/metrics/metrics_classes/metrics.py | 26 +++++++++----------
2 files changed, 18 insertions(+), 14 deletions(-)
diff --git a/src/syngen/ml/metrics/accuracy_test/accuracy_report.html b/src/syngen/ml/metrics/accuracy_test/accuracy_report.html
index cc23cdea..1361756b 100644
--- a/src/syngen/ml/metrics/accuracy_test/accuracy_report.html
+++ b/src/syngen/ml/metrics/accuracy_test/accuracy_report.html
@@ -514,7 +514,11 @@ Content
operation
is performed for the synthetic dataset. The resulting matrix is the difference between original
correlation
- matrix and the synthetic one.
+ matrix and the synthetic one.
+ {% if correlation_median != correlation_median %}
+ NaN values in the resulting matrix are represented with gray color.
+ {% endif %}
+
diff --git a/src/syngen/ml/metrics/metrics_classes/metrics.py b/src/syngen/ml/metrics/metrics_classes/metrics.py
index 19eefaf3..7adc6688 100644
--- a/src/syngen/ml/metrics/metrics_classes/metrics.py
+++ b/src/syngen/ml/metrics/metrics_classes/metrics.py
@@ -275,23 +275,23 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]):
.dropna(how="all", axis=1)
)
- # check if there are any nans left in corr_score
- if self.corr_score.isna().values.any():
- # mask for NaNs in both original_heatmap and synthetic_heatmap
- nan_mask = (
- np.isnan(self.original_heatmap) &
- np.isnan(self.synthetic_heatmap)
- )
-
- # Set the NaN values in corr_score to 0 where both
- # original_heatmap and synthetic_heatmap have NaNs
- self.corr_score[nan_mask] = 0
+ # # check if there are any nans left in corr_score
+ # if self.corr_score.isna().values.any():
+ # # mask for NaNs in both original_heatmap and synthetic_heatmap
+ # nan_mask = (
+ # np.isnan(self.original_heatmap) &
+ # np.isnan(self.synthetic_heatmap)
+ # )
+
+ # # Set the NaN values in corr_score to 0 where both
+ # # original_heatmap and synthetic_heatmap have NaNs
+ # self.corr_score[nan_mask] = 0
if self.plot:
plt.clf()
- # set color for NaN values
+ # set mask for NaN values
nan_mask = self.corr_score.isna()
- # Color for NaNs
+ # Color for NaN values
self.cmap.set_bad('gray')
sns.set(rc={"figure.figsize": self.corr_score.shape}, font_scale=2)
From 4c09a3ceabd1b29ba73e32908545bf71a0e0c99b Mon Sep 17 00:00:00 2001
From: Ijka
Date: Wed, 31 Jul 2024 08:07:43 +0000
Subject: [PATCH 6/8] annotations for NaN values are added. Description text in
report is updated
---
.../accuracy_test/accuracy_report.html | 14 +---
.../ml/metrics/metrics_classes/metrics.py | 79 +++++++++++--------
2 files changed, 48 insertions(+), 45 deletions(-)
diff --git a/src/syngen/ml/metrics/accuracy_test/accuracy_report.html b/src/syngen/ml/metrics/accuracy_test/accuracy_report.html
index 1361756b..9df46461 100644
--- a/src/syngen/ml/metrics/accuracy_test/accuracy_report.html
+++ b/src/syngen/ml/metrics/accuracy_test/accuracy_report.html
@@ -505,18 +505,10 @@ Content
BAD
- The correlation metric is calculated in the following manner: for each
- pair
- of
- numeric columns in
- original
- dataset a pairwise correlations is measured thus resulting in the correlations matrix. The same
- operation
- is performed for the synthetic dataset. The resulting matrix is the difference between original
- correlation
- matrix and the synthetic one.
+
The correlation metric is calculated in the following manner: for each pair of numeric and categorical columns in the original dataset, a pairwise correlation is measured, thus resulting in the correlation matrix. The same operation is performed for the synthetic dataset. The resulting matrix is the difference between the original correlation matrix and the synthetic one.
+
{% if correlation_median != correlation_median %}
- NaN values in the resulting matrix are represented with gray color.
+ NaN values, shown in gray, indicate that correlations could be computed in one dataset but not the other. This suggests significant differences in data characteristics between original and synthetic datasets.
{% endif %}
Date: Wed, 31 Jul 2024 09:18:06 +0000
Subject: [PATCH 7/8] VERSION updated
---
src/syngen/VERSION | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/syngen/VERSION b/src/syngen/VERSION
index c2aeab43..52c7246e 100644
--- a/src/syngen/VERSION
+++ b/src/syngen/VERSION
@@ -1 +1 @@
-0.9.24rc1
+0.9.24rc2
From a62a8c9b89e6c2aaa8439ec562a2d3143dd05167 Mon Sep 17 00:00:00 2001
From: Ijka
Date: Thu, 22 Aug 2024 07:06:00 +0000
Subject: [PATCH 8/8] VERSION updated
---
src/syngen/VERSION | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/syngen/VERSION b/src/syngen/VERSION
index 6a04595c..051fc56b 100644
--- a/src/syngen/VERSION
+++ b/src/syngen/VERSION
@@ -1 +1 @@
-0.9.24rc3
+0.9.28