Merge pull request #444 from tdspora/EPMCTDM-7065_uuid_type_columns

EPMCTDM-7065: UUID type columns
tdspora · Aug 27, 2024 · 65a93de
2 parents a3211e5 + fd87775
commit 65a93de
Show file tree

Hide file tree

Showing 11 changed files with 373 additions and 165 deletions.
diff --git a/src/syngen/VERSION b/src/syngen/VERSION
@@ -1 +1 @@
-0.9.28
+0.9.29
diff --git a/src/syngen/ml/handlers/handlers.py b/src/syngen/ml/handlers/handlers.py
@@ -329,10 +329,6 @@ def run_separate(self, params: Tuple):
         if self.has_no_ml:
             synthetic_infer = self.generate_long_texts(size, synthetic_infer)
 
-        uuid_columns = self.dataset.uuid_columns
-        if uuid_columns:
-            synthetic_infer = generate_uuid(size, self.dataset, uuid_columns, synthetic_infer)
-
         return synthetic_infer
 
     def split_by_batches(self):

diff --git a/src/syngen/ml/metrics/accuracy_test/accuracy_test.py b/src/syngen/ml/metrics/accuracy_test/accuracy_test.py
@@ -116,22 +116,40 @@ def __init__(
         super().__init__(original, synthetic, paths, table_name, infer_config)
         self.reports_path = f"{self.paths['reports_path']}/accuracy"
         self.univariate = UnivariateMetric(
-            self.original, self.synthetic, self.plot_exists, self.reports_path
+            self.original,
+            self.synthetic,
+            self.plot_exists,
+            self.reports_path
         )
         self.bivariate = BivariateMetric(
-            self.original, self.synthetic, self.plot_exists, self.reports_path
+            self.original,
+            self.synthetic,
+            self.plot_exists,
+            self.reports_path
         )
         self.correlations = Correlations(
-            self.original, self.synthetic, self.plot_exists, self.reports_path
+            self.original,
+            self.synthetic,
+            self.plot_exists,
+            self.reports_path
         )
         self.clustering = Clustering(
-            self.original, self.synthetic, self.plot_exists, self.reports_path
+            self.original,
+            self.synthetic,
+            self.plot_exists,
+            self.reports_path
         )
         self.utility = Utility(
-            self.original, self.synthetic, self.plot_exists, self.reports_path
+            self.original,
+            self.synthetic,
+            self.plot_exists,
+            self.reports_path
         )
         self.acc = JensenShannonDistance(
-            self.original, self.synthetic, self.plot_exists, self.reports_path
+            self.original,
+            self.synthetic,
+            self.plot_exists,
+            self.reports_path
         )
         self._prepare_dir()
 

diff --git a/src/syngen/ml/metrics/metrics_classes/metrics.py b/src/syngen/ml/metrics/metrics_classes/metrics.py
@@ -24,7 +24,7 @@
 from slugify import slugify
 from loguru import logger
 
-from syngen.ml.utils import get_nan_labels, nan_labels_to_float, timestamp_to_datetime
+from syngen.ml.utils import timestamp_to_datetime
 matplotlib.use("Agg")
 
 
@@ -36,9 +36,8 @@ def __init__(
         plot: bool,
         reports_path: str,
     ):
-        columns_nan_labels = get_nan_labels(original)
-        self.original = nan_labels_to_float(original, columns_nan_labels)
-        self.synthetic = nan_labels_to_float(synthetic, columns_nan_labels)
+        self.original = original
+        self.synthetic = synthetic
         self.reports_path = reports_path
         self.plot = plot
         self.value = None
@@ -1093,13 +1092,21 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]):
                 )
             )
 
-        best_categ, score_categ, synth_score_categ = self.__create_multi_class_models(categ_cols)
+        (
+            best_categ,
+            score_categ,
+            synth_score_categ
+        ) = self.__create_multi_class_models(categ_cols)
         (
             best_binary,
             score_binary,
             synth_score_binary,
         ) = self.__create_binary_class_models(binary_cols)
-        best_regres, score_regres, synth_regres_score = self.__create_regression_models(cont_cols)
+        (
+            best_regres,
+            score_regres,
+            synth_regres_score
+        ) = self.__create_regression_models(cont_cols)
 
         result = pd.DataFrame(
             {
@@ -1176,23 +1183,32 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]):
             "as a target and other columns as predictors"
         )
         if best_binary is not None:
+            score = (0 if score_binary == 0 else
+                     round(synth_score_binary/score_binary, 4)
+                     )
             logger.info(log_msg.format(
                 'binary',
-                round(synth_score_binary/score_binary, 4),
+                score,
                 best_binary
                 )
             )
         if best_categ is not None:
+            score = (0 if score_categ == 0 else
+                     round(synth_score_categ/score_categ, 4)
+                     )
             logger.info(log_msg.format(
                 'multiclass',
-                round(synth_score_categ/score_categ, 4),
+                score,
                 best_categ
                 )
             )
         if best_regres is not None:
+            score = (0 if score_regres == 0 else
+                     abs(round(max(0, synth_regres_score) / score_regres, 4))
+                     )
             logger.info(log_msg.format(
                 'regression',
-                abs(round(max(0, synth_regres_score) / score_regres, 4)),
+                score,
                 best_regres
                 )
             )
@@ -1272,7 +1288,7 @@ def __model_process(self, model_object, targets, task_type):
                 logger.info(
                     f"The best score for all possible {task_type} models "
                     f"for the original data is "
-                    f"{best_score}, which is below 0.6. "
+                    f"{round(best_score, 4)}, which is below 0.6. "
                     f"The utility metric is unreliable"
                 )
             synthetic = pd.get_dummies(self.synthetic.drop(best_target, axis=1))

diff --git a/src/syngen/ml/metrics/sample_test/sample_test.py b/src/syngen/ml/metrics/sample_test/sample_test.py
@@ -27,7 +27,12 @@ def __get_univariate_metric(self):
         Do preparation work before creating the report
         """
         self._prepare_dir()
-        return UnivariateMetric(self.original, self.synthetic, True, self.reports_path)
+        return UnivariateMetric(
+            self.original,
+            self.synthetic,
+            True,
+            self.reports_path
+        )
 
     def __remove_artifacts(self):
         """

diff --git a/src/syngen/ml/reporters/reporters.py b/src/syngen/ml/reporters/reporters.py
@@ -7,7 +7,6 @@
 from loguru import logger
 
 from syngen.ml.utils import (
-    get_nan_labels,
     nan_labels_to_float,
     fetch_config,
     datetime_to_timestamp,
@@ -28,27 +27,34 @@ def __init__(self, table_name: str, paths: Dict[str, str], config: Dict[str, str
         self.table_name = table_name
         self.paths = paths
         self.config = config
+        self.dataset = None
+        self.columns_nan_labels = dict()
 
     def _extract_report_data(self):
         original, schema = DataLoader(self.paths["original_data_path"]).load_data()
         synthetic, schema = DataLoader(self.paths["path_to_merged_infer"]).load_data()
         return original, synthetic
 
     def fetch_data_types(self):
-        dataset = fetch_config(self.paths["dataset_pickle_path"])
+        self.dataset = fetch_config(self.paths["dataset_pickle_path"])
+        self.columns_nan_labels = self.dataset.nan_labels_dict
         types = (
-            dataset.str_columns,
-            dataset.date_columns,
-            dataset.int_columns,
-            dataset.float_columns,
-            dataset.binary_columns,
-            dataset.categ_columns,
-            dataset.long_text_columns,
-            dataset.email_columns,
+            self.dataset.str_columns,
+            self.dataset.date_columns,
+            self.dataset.int_columns,
+            self.dataset.float_columns,
+            self.dataset.binary_columns,
+            self.dataset.categ_columns,
+            self.dataset.long_text_columns,
+            self.dataset.email_columns,
         )
 
         # eliminate keys columns from the report
-        keys_columns = set(dataset.pk_columns) | set(dataset.fk_columns) | set(dataset.uq_columns)
+        keys_columns = (
+            set(self.dataset.pk_columns) |
+            set(self.dataset.fk_columns) |
+            set(self.dataset.uq_columns)
+        )
         types = tuple(columns - keys_columns for columns in types)
 
         return types
@@ -69,15 +75,14 @@ def preprocess_data(self):
         Return original data, synthetic data, float columns, integer columns, categorical columns
         without keys columns
         """
+        types = self.fetch_data_types()
         original, synthetic = self._extract_report_data()
         missing_columns = set(original) - set(synthetic)
         for col in missing_columns:
             synthetic[col] = np.nan
-        columns_nan_labels = get_nan_labels(original)
-        original = nan_labels_to_float(original, columns_nan_labels)
-        synthetic = nan_labels_to_float(synthetic, columns_nan_labels)
-        dataset = fetch_config(self.paths["dataset_pickle_path"])
-        types = self.fetch_data_types()
+        exclude_columns = self.dataset.uuid_columns
+        original = nan_labels_to_float(original, self.columns_nan_labels, exclude_columns)
+        synthetic = nan_labels_to_float(synthetic, self.columns_nan_labels, exclude_columns)
         (
             str_columns,
             date_columns,
@@ -91,8 +96,8 @@ def preprocess_data(self):
 
         original = original[[col for col in original.columns if col in set().union(*types)]]
         synthetic = synthetic[[col for col in synthetic.columns if col in set().union(*types)]]
-        na_values = dataset.format.get("na_values", [])
-        for date_col, date_format in dataset.date_mapping.items():
+        na_values = self.dataset.format.get("na_values", [])
+        for date_col, date_format in self.dataset.date_mapping.items():
             original[date_col] = self.convert_to_timestamp(
                 original, date_col, date_format, na_values
             )
@@ -240,7 +245,13 @@ def report(self):
             categ_columns,
             date_columns,
         ) = self.preprocess_data()
-        accuracy_test = AccuracyTest(original, synthetic, self.paths, self.table_name, self.config)
+        accuracy_test = AccuracyTest(
+            original,
+            synthetic,
+            self.paths,
+            self.table_name,
+            self.config,
+        )
         accuracy_test.report(
             cont_columns=list(float_columns | int_columns),
             categ_columns=list(categ_columns),
@@ -273,7 +284,11 @@ def report(self):
             date_columns,
         ) = self.preprocess_data()
         accuracy_test = SampleAccuracyTest(
-            original, sampled, self.paths, self.table_name, self.config
+            original,
+            sampled,
+            self.paths,
+            self.table_name,
+            self.config,
         )
         accuracy_test.report(
             cont_columns=list(float_columns | int_columns),

diff --git a/src/syngen/ml/utils/utils.py b/src/syngen/ml/utils/utils.py
@@ -1,7 +1,7 @@
 import os
 import sys
 import re
-from typing import List, Dict, Optional, Union
+from typing import List, Dict, Optional, Union, Tuple
 from dateutil import parser
 from datetime import datetime, timedelta
 
@@ -197,25 +197,33 @@ def get_nan_labels(df: pd.DataFrame) -> dict:
         if (float_val is not None) and (not np.isnan(float_val)) and len(str_values) == 1:
             nan_label = str_values[0]
             columns_nan_labels[column] = nan_label
+        elif (float_val is not None) and (not np.isnan(float_val)) and not str_values:
+            columns_nan_labels[column] = None
 
     return columns_nan_labels
 
 
-def nan_labels_to_float(df: pd.DataFrame, columns_nan_labels: dict) -> pd.DataFrame:
+def nan_labels_to_float(
+    df: pd.DataFrame,
+    columns_nan_labels: dict,
+    exclude_columns: set = set()
+) -> pd.DataFrame:
     """
-    Replace str nan labels in float/int columns with actual np.nan
+    Replace str nan labels in float/int columns with actual np.NaN
     and casting the column to float type.
-
-    Args:
-        df (pd.DataFrame): table data
-
-    Returns:
-        pd.DataFrame: DataFrame with str NaN labels in float/int columns replaced with np.nan
     """
     df_with_nan = df.copy()
     for column, label in columns_nan_labels.items():
-        df_with_nan[column].replace(label, np.NaN, inplace=True)
-        df_with_nan[column] = df_with_nan[column].astype(float)
+        if column not in exclude_columns:
+            df_with_nan[column].replace(label, np.NaN, inplace=True)
+            df_with_nan[column] = df_with_nan[column].astype(float)
+
+            logger.info(
+                f"Column '{column}' contains unique "
+                f"non-numeric value: '{label}'. "
+                "It will be treated as null label "
+                "and replaced with nulls."
+            )
 
     return df_with_nan