Skip to content

Commit

Permalink
Merge pull request #444 from tdspora/EPMCTDM-7065_uuid_type_columns
Browse files Browse the repository at this point in the history
Epmctdm 7065 UUID type columns
  • Loading branch information
Ijka authored Aug 27, 2024
2 parents a3211e5 + fd87775 commit 65a93de
Show file tree
Hide file tree
Showing 11 changed files with 373 additions and 165 deletions.
2 changes: 1 addition & 1 deletion src/syngen/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.9.28
0.9.29
4 changes: 0 additions & 4 deletions src/syngen/ml/handlers/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,10 +329,6 @@ def run_separate(self, params: Tuple):
if self.has_no_ml:
synthetic_infer = self.generate_long_texts(size, synthetic_infer)

uuid_columns = self.dataset.uuid_columns
if uuid_columns:
synthetic_infer = generate_uuid(size, self.dataset, uuid_columns, synthetic_infer)

return synthetic_infer

def split_by_batches(self):
Expand Down
30 changes: 24 additions & 6 deletions src/syngen/ml/metrics/accuracy_test/accuracy_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,22 +116,40 @@ def __init__(
super().__init__(original, synthetic, paths, table_name, infer_config)
self.reports_path = f"{self.paths['reports_path']}/accuracy"
self.univariate = UnivariateMetric(
self.original, self.synthetic, self.plot_exists, self.reports_path
self.original,
self.synthetic,
self.plot_exists,
self.reports_path
)
self.bivariate = BivariateMetric(
self.original, self.synthetic, self.plot_exists, self.reports_path
self.original,
self.synthetic,
self.plot_exists,
self.reports_path
)
self.correlations = Correlations(
self.original, self.synthetic, self.plot_exists, self.reports_path
self.original,
self.synthetic,
self.plot_exists,
self.reports_path
)
self.clustering = Clustering(
self.original, self.synthetic, self.plot_exists, self.reports_path
self.original,
self.synthetic,
self.plot_exists,
self.reports_path
)
self.utility = Utility(
self.original, self.synthetic, self.plot_exists, self.reports_path
self.original,
self.synthetic,
self.plot_exists,
self.reports_path
)
self.acc = JensenShannonDistance(
self.original, self.synthetic, self.plot_exists, self.reports_path
self.original,
self.synthetic,
self.plot_exists,
self.reports_path
)
self._prepare_dir()

Expand Down
36 changes: 26 additions & 10 deletions src/syngen/ml/metrics/metrics_classes/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from slugify import slugify
from loguru import logger

from syngen.ml.utils import get_nan_labels, nan_labels_to_float, timestamp_to_datetime
from syngen.ml.utils import timestamp_to_datetime
matplotlib.use("Agg")


Expand All @@ -36,9 +36,8 @@ def __init__(
plot: bool,
reports_path: str,
):
columns_nan_labels = get_nan_labels(original)
self.original = nan_labels_to_float(original, columns_nan_labels)
self.synthetic = nan_labels_to_float(synthetic, columns_nan_labels)
self.original = original
self.synthetic = synthetic
self.reports_path = reports_path
self.plot = plot
self.value = None
Expand Down Expand Up @@ -1093,13 +1092,21 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]):
)
)

best_categ, score_categ, synth_score_categ = self.__create_multi_class_models(categ_cols)
(
best_categ,
score_categ,
synth_score_categ
) = self.__create_multi_class_models(categ_cols)
(
best_binary,
score_binary,
synth_score_binary,
) = self.__create_binary_class_models(binary_cols)
best_regres, score_regres, synth_regres_score = self.__create_regression_models(cont_cols)
(
best_regres,
score_regres,
synth_regres_score
) = self.__create_regression_models(cont_cols)

result = pd.DataFrame(
{
Expand Down Expand Up @@ -1176,23 +1183,32 @@ def calculate_all(self, categ_columns: List[str], cont_columns: List[str]):
"as a target and other columns as predictors"
)
if best_binary is not None:
score = (0 if score_binary == 0 else
round(synth_score_binary/score_binary, 4)
)
logger.info(log_msg.format(
'binary',
round(synth_score_binary/score_binary, 4),
score,
best_binary
)
)
if best_categ is not None:
score = (0 if score_categ == 0 else
round(synth_score_categ/score_categ, 4)
)
logger.info(log_msg.format(
'multiclass',
round(synth_score_categ/score_categ, 4),
score,
best_categ
)
)
if best_regres is not None:
score = (0 if score_regres == 0 else
abs(round(max(0, synth_regres_score) / score_regres, 4))
)
logger.info(log_msg.format(
'regression',
abs(round(max(0, synth_regres_score) / score_regres, 4)),
score,
best_regres
)
)
Expand Down Expand Up @@ -1272,7 +1288,7 @@ def __model_process(self, model_object, targets, task_type):
logger.info(
f"The best score for all possible {task_type} models "
f"for the original data is "
f"{best_score}, which is below 0.6. "
f"{round(best_score, 4)}, which is below 0.6. "
f"The utility metric is unreliable"
)
synthetic = pd.get_dummies(self.synthetic.drop(best_target, axis=1))
Expand Down
7 changes: 6 additions & 1 deletion src/syngen/ml/metrics/sample_test/sample_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,12 @@ def __get_univariate_metric(self):
Do preparation work before creating the report
"""
self._prepare_dir()
return UnivariateMetric(self.original, self.synthetic, True, self.reports_path)
return UnivariateMetric(
self.original,
self.synthetic,
True,
self.reports_path
)

def __remove_artifacts(self):
"""
Expand Down
55 changes: 35 additions & 20 deletions src/syngen/ml/reporters/reporters.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from loguru import logger

from syngen.ml.utils import (
get_nan_labels,
nan_labels_to_float,
fetch_config,
datetime_to_timestamp,
Expand All @@ -28,27 +27,34 @@ def __init__(self, table_name: str, paths: Dict[str, str], config: Dict[str, str
self.table_name = table_name
self.paths = paths
self.config = config
self.dataset = None
self.columns_nan_labels = dict()

def _extract_report_data(self):
    """
    Load the original table and the merged synthetic table.

    Both tables are read through ``DataLoader`` from the locations stored in
    ``self.paths`` under the keys ``"original_data_path"`` and
    ``"path_to_merged_infer"``. The second element returned by each
    ``load_data()`` call (the schema) is discarded.

    Returns:
        tuple: ``(original, synthetic)`` as returned by the loaders
    """
    original_loader = DataLoader(self.paths["original_data_path"])
    original_data, _ = original_loader.load_data()

    synthetic_loader = DataLoader(self.paths["path_to_merged_infer"])
    synthetic_data, _ = synthetic_loader.load_data()

    return original_data, synthetic_data

def fetch_data_types(self):
    """
    Load the stored dataset description and return its column groups
    with the key columns removed.

    Side effects:
        * ``self.dataset`` is set to the object returned by
          ``fetch_config(self.paths["dataset_pickle_path"])``
        * ``self.columns_nan_labels`` is set to ``self.dataset.nan_labels_dict``

    Returns:
        tuple: eight column groups in the fixed order
        (str, date, int, float, binary, categ, long_text, email),
        each with the primary key, foreign key and unique key columns
        subtracted
    """
    self.dataset = fetch_config(self.paths["dataset_pickle_path"])
    self.columns_nan_labels = self.dataset.nan_labels_dict

    # NOTE(review): every group below is assumed to be a set (it is used
    # as the left operand of "-" with a set) - confirm against the dataset class
    types = (
        self.dataset.str_columns,
        self.dataset.date_columns,
        self.dataset.int_columns,
        self.dataset.float_columns,
        self.dataset.binary_columns,
        self.dataset.categ_columns,
        self.dataset.long_text_columns,
        self.dataset.email_columns,
    )

    # eliminate keys columns from the report
    keys_columns = (
        set(self.dataset.pk_columns) |
        set(self.dataset.fk_columns) |
        set(self.dataset.uq_columns)
    )
    types = tuple(columns - keys_columns for columns in types)

    return types
Expand All @@ -69,15 +75,14 @@ def preprocess_data(self):
Return original data, synthetic data, float columns, integer columns, categorical columns
without keys columns
"""
types = self.fetch_data_types()
original, synthetic = self._extract_report_data()
missing_columns = set(original) - set(synthetic)
for col in missing_columns:
synthetic[col] = np.nan
columns_nan_labels = get_nan_labels(original)
original = nan_labels_to_float(original, columns_nan_labels)
synthetic = nan_labels_to_float(synthetic, columns_nan_labels)
dataset = fetch_config(self.paths["dataset_pickle_path"])
types = self.fetch_data_types()
exclude_columns = self.dataset.uuid_columns
original = nan_labels_to_float(original, self.columns_nan_labels, exclude_columns)
synthetic = nan_labels_to_float(synthetic, self.columns_nan_labels, exclude_columns)
(
str_columns,
date_columns,
Expand All @@ -91,8 +96,8 @@ def preprocess_data(self):

original = original[[col for col in original.columns if col in set().union(*types)]]
synthetic = synthetic[[col for col in synthetic.columns if col in set().union(*types)]]
na_values = dataset.format.get("na_values", [])
for date_col, date_format in dataset.date_mapping.items():
na_values = self.dataset.format.get("na_values", [])
for date_col, date_format in self.dataset.date_mapping.items():
original[date_col] = self.convert_to_timestamp(
original, date_col, date_format, na_values
)
Expand Down Expand Up @@ -240,7 +245,13 @@ def report(self):
categ_columns,
date_columns,
) = self.preprocess_data()
accuracy_test = AccuracyTest(original, synthetic, self.paths, self.table_name, self.config)
accuracy_test = AccuracyTest(
original,
synthetic,
self.paths,
self.table_name,
self.config,
)
accuracy_test.report(
cont_columns=list(float_columns | int_columns),
categ_columns=list(categ_columns),
Expand Down Expand Up @@ -273,7 +284,11 @@ def report(self):
date_columns,
) = self.preprocess_data()
accuracy_test = SampleAccuracyTest(
original, sampled, self.paths, self.table_name, self.config
original,
sampled,
self.paths,
self.table_name,
self.config,
)
accuracy_test.report(
cont_columns=list(float_columns | int_columns),
Expand Down
30 changes: 19 additions & 11 deletions src/syngen/ml/utils/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import sys
import re
from typing import List, Dict, Optional, Union
from typing import List, Dict, Optional, Union, Tuple
from dateutil import parser
from datetime import datetime, timedelta

Expand Down Expand Up @@ -197,25 +197,33 @@ def get_nan_labels(df: pd.DataFrame) -> dict:
if (float_val is not None) and (not np.isnan(float_val)) and len(str_values) == 1:
nan_label = str_values[0]
columns_nan_labels[column] = nan_label
elif (float_val is not None) and (not np.isnan(float_val)) and not str_values:
columns_nan_labels[column] = None

return columns_nan_labels


def nan_labels_to_float(
    df: pd.DataFrame,
    columns_nan_labels: dict,
    exclude_columns: Optional[set] = None,
) -> pd.DataFrame:
    """
    Replace str nan labels in float/int columns with actual np.nan
    and cast these columns to the float type.
    Args:
        df (pd.DataFrame): table data, it is not modified
        columns_nan_labels (dict): mapping of a column name to the label
            which denotes a missing value in this column
        exclude_columns (Optional[set]): columns that should be left untouched
            even if they are listed in 'columns_nan_labels';
            nothing is excluded by default
    Returns:
        pd.DataFrame: copy of the DataFrame with str NaN labels
        in float/int columns replaced with np.nan
    """
    # 'None' instead of a mutable default which would be shared between calls
    excluded = exclude_columns if exclude_columns is not None else frozenset()

    df_with_nan = df.copy()
    for column, label in columns_nan_labels.items():
        if column in excluded:
            continue

        # Assign the result back instead of calling 'replace(..., inplace=True)'
        # on 'df_with_nan[column]': the chained in-place call works on a temporary
        # object when pandas Copy-on-Write is enabled and leaves the frame unchanged
        df_with_nan[column] = (
            df_with_nan[column].replace(label, np.nan).astype(float)
        )

        logger.info(
            f"Column '{column}' contains unique "
            f"non-numeric value: '{label}'. "
            "It will be treated as null label "
            "and replaced with nulls."
        )

    return df_with_nan

Expand Down
Loading

0 comments on commit 65a93de

Please sign in to comment.