Synthetic Data Generation Workflow - Bank Marketing Data¶


Synthetic data is artificially generated data that mimics the statistical properties of a real dataset (e.g. column structure, distributions, relationships between variables). It is useful for sharing data safely, augmenting datasets, and testing pipelines without exposing sensitive information.

This notebook shows a synthetic data generation workflow using the Synthetic Data Vault (SDV) library and a marketing dataset.

The dataset represents a bank marketing campaign from the UCI Machine Learning Repository, with 4,119 records, 21 variables, and a mix of numeric and categorical types.

The goal is to generate a synthetic version of the dataset that preserves the statistical properties of the original, including column distributions, inter-variable correlations, and valid category values.

The final model used in this workflow is SDV's CopulaGANSynthesizer, a hybrid approach that combines Gaussian copula marginal fitting with CTGAN-based joint distribution learning. Four synthesizers were evaluated in total; see the Synthetic Data Generation section for a discussion of the tradeoffs.

The notebook is organized as follows:

  • Data Description: discussion of context, interesting variables, and missing values
  • Data Loading: schema definition, ingestion, and data summary
  • Synthetic Data Generation: metadata detection, model training, and sampling
  • Synthetic Data Validation: distribution comparison, correlation analysis, quality scores, and constraint checks
  • Save Outputs: save synthetic dataset, fitted model, and metadata to disk
  • Conditional Generation: brief demo of SDV's conditional generation functionality

Data Description¶


The dataset comes from direct marketing campaigns run by a Portuguese bank between 2008 and 2013. Each record represents one client contact (by phone), with the target variable y indicating whether the client subscribed to a term deposit.

Variables fall into three groups:

  • Client attributes: age, job, marital status, education, and whether the client has credit in default, a housing loan, or a personal loan
  • Campaign contact info: contact type, month, day of week, call duration, and number of contacts in the current campaign
  • Economic context: employment variation rate, consumer price index, consumer confidence index, Euribor 3-month rate, and number of employees

Note on pdays: This variable records the number of days since the client was last contacted in a previous campaign. The value 999 means they were never previously contacted, which accounts for about 96% of rows. This is a sentinel value, not missing data, and it makes pdays effectively bimodal, which will show up in the validation results.
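The sentinel is easy to make explicit in code. A minimal sketch on a toy stand-in for pdays (not the real column):

```python
import pandas as pd

# Toy stand-in for pdays: 96% sentinel values, a few real contact gaps
pdays = pd.Series([999] * 96 + [3, 6, 6, 12])

never_contacted = pdays.eq(999)          # boolean mask for the sentinel
print(never_contacted.mean())            # 0.96: fraction never contacted
print(pdays[~never_contacted].tolist())  # [3, 6, 6, 12]: the genuine gaps
```

Splitting the two modes like this (a flag plus the genuine values) is a common alternative encoding; here we keep the raw column, so the synthesizer must learn the bimodal shape itself.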

Note on missing values: Some columns (e.g. default, housing, and loan) contain an unknown category which we treat as a valid category rather than imputing or converting to NaN. This preserves the information that the value was unknown at the time of contact. Otherwise, the dataset has no true missing values.
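In pandas, this means leaving unknown as an ordinary category level. A small sketch of the difference versus converting it to missing:

```python
import pandas as pd

housing = pd.Series(['yes', 'no', 'unknown', 'yes'], dtype='category')

# Kept as a level: no missing values, 'unknown' carries information
print(housing.isna().sum())              # 0
print(list(housing.cat.categories))      # ['no', 'unknown', 'yes']

# The alternative (masking to NA) would discard that information
as_na = housing.astype(object).mask(housing == 'unknown')
print(as_na.isna().sum())                # 1
```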

Data Loading¶


We define a schema with data types for each column and then load the data.

In [1]:
import pandas as pd
import numpy as np
from minieda import summarize
import sdv
from sdv.metadata import Metadata
from sdv.single_table import CopulaGANSynthesizer
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import run_diagnostic

from sdmetrics.single_table import NewRowSynthesis
from sdmetrics.single_table import DCRBaselineProtection

from sdv.sampling import Condition

import matplotlib.pyplot as plt
import os
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

print(pd.__version__)
print(sdv.__version__)
2.2.3
1.34.2
In [2]:
# Function for printing tables
def style_table(df, caption, precision=1, caption_side="bottom", font_size="16px", font_weight="normal"):
    return (df.style
            .format(precision=precision).set_caption(caption)
            .set_table_styles([{"selector": "caption", 
                                "props": [("caption-side", caption_side),
                                          ("font-size", font_size),
                                          ("font-weight", font_weight)]}]))
In [3]:
data_schema = {
    'age':              'int64',      # client age
    'job':              'category',   # type of job (admin, blue-collar, entrepreneur, housemaid, management, retired, self-employed, services, student, technician, unemployed, unknown)
    'marital':          'category',   # marital status (divorced, married, single, unknown)
    'education':        'category',   # education level (basic.4y, basic.6y, basic.9y, high.school, illiterate, professional.course, university.degree, unknown)
    'default':          'category',   # has credit in default? (yes, no, unknown)
    'housing':          'category',   # has housing loan? (yes, no, unknown)
    'loan':             'category',   # has personal loan? (yes, no, unknown)
    'contact':          'category',   # contact communication type (cellular, telephone)
    'month':            'category',   # last contact month of year (jan–dec)
    'day_of_week':      'category',   # last contact day of the week (mon, tue, wed, thu, fri)
    'duration':         'int64',      # last contact duration in seconds — benchmark only, not for predictive modeling
    'campaign':         'int64',      # number of contacts performed during this campaign (includes last contact)
    'pdays':            'int64',      # days since client was last contacted from previous campaign (999 = not previously contacted)
    'previous':         'int64',      # number of contacts performed before this campaign
    'poutcome':         'category',   # outcome of previous marketing campaign (failure, nonexistent, success)
    'emp.var.rate':     'float64',    # employment variation rate — quarterly indicator
    'cons.price.idx':   'float64',    # consumer price index — monthly indicator
    'cons.conf.idx':    'float64',    # consumer confidence index — monthly indicator
    'euribor3m':        'float64',    # euribor 3 month rate — daily indicator
    'nr.employed':      'float64',    # number of employees — quarterly indicator
    'y':                'category',   # target: has the client subscribed a term deposit? (yes, no)
}

# Load data with schema
data = pd.read_csv('data/bank-additional.csv', sep=";", dtype=data_schema)

# Convert yes/no columns to True/False
bool_cols = ['y']
for col in bool_cols:
    data[col] = data[col].map({'yes': True, 'no': False})
In [4]:
summarize(data)
Out[4]:
dtype count unique unique_pct missing missing_pct zero zero_pct top freq mean std min 50% max skew
age int64 4119 67 1.63 0 0.0 0 0.00 40.11 10.31 18.0 38.0 88.0 0.72
campaign int64 4119 25 0.61 0 0.0 0 0.00 2.54 2.57 1.0 2.0 35.0 4.0
nr.employed float64 4119 11 0.27 0 0.0 0 0.00 5166.48 73.67 4963.6 5191.0 5228.1 -1.08
euribor3m float64 4119 234 5.68 0 0.0 0 0.00 3.62 1.73 0.64 4.86 5.04 -0.72
cons.conf.idx float64 4119 26 0.63 0 0.0 0 0.00 -40.5 4.59 -50.8 -41.8 -26.9 0.29
cons.price.idx float64 4119 26 0.63 0 0.0 0 0.00 93.58 0.58 92.2 93.75 94.77 -0.22
emp.var.rate float64 4119 10 0.24 0 0.0 0 0.00 0.08 1.56 -3.4 1.1 1.4 -0.73
previous int64 4119 7 0.17 0 0.0 3523 85.53 0.19 0.54 0.0 0.0 6.0 4.02
pdays int64 4119 21 0.51 0 0.0 2 0.05 960.42 191.92 0.0 999.0 999.0 -4.78
duration int64 4119 828 20.10 0 0.0 1 0.02 256.79 254.7 0.0 181.0 3643.0 3.29
job category 4119 12 0.29 0 0.0 0 0.00 admin. 1012
day_of_week category 4119 5 0.12 0 0.0 0 0.00 thu 860
month category 4119 10 0.24 0 0.0 0 0.00 may 1378
contact category 4119 2 0.05 0 0.0 0 0.00 cellular 2652
poutcome category 4119 3 0.07 0 0.0 0 0.00 nonexistent 3523
loan category 4119 3 0.07 0 0.0 0 0.00 no 3349
housing category 4119 3 0.07 0 0.0 0 0.00 yes 2175
default category 4119 3 0.07 0 0.0 0 0.00 no 3315
education category 4119 8 0.19 0 0.0 0 0.00 university.degree 1264
marital category 4119 4 0.10 0 0.0 0 0.00 married 2509
y category 4119 2 0.05 0 0.0 0 0.00 False 3668

Synthetic Data Generation¶


We use CopulaGANSynthesizer as the primary synthesizer. It combines Gaussian copula marginal fitting with CTGAN-based joint distribution learning, which produces better fidelity on both column distributions and categorical relationships than GaussianCopulaSynthesizer alone (Column Shapes 0.88 vs 0.74, categorical pair trends 0.67 vs 0.43).

Three other synthesizers were evaluated (code and results not shown in this notebook):

  • GaussianCopulaSynthesizer produced good validity and privacy scores but systematically miscalibrated heavily imbalanced categorical columns, inflating the unknown category in default and marital to become the dominant value in the synthetic data.
  • CTGANSynthesizer failed to reproduce inter-variable correlations at this dataset size, producing a near-flat correlation matrix.
  • TVAESynthesizer dropped minority categories entirely for several columns including loan and marital.

CopulaGAN avoids all three failure modes while accepting a noticeable tradeoff in nearest-neighbor privacy score (0.37 vs 0.53 for Gaussian Copula) — synthetic rows sit closer to real rows than random noise.

We sample 4,119 rows (same as the original dataset).

In [5]:
metadata = Metadata.detect_from_dataframe(data=data, table_name='bank_marketing')

# CopulaGAN requires categorical columns to be `object` dtype instead of pandas' `category`
data_copulagan = data.copy()
cat_cols = data.select_dtypes('category').columns
data_copulagan[cat_cols] = data_copulagan[cat_cols].astype(object)

# Instantiate and fit the model
synthesizer = CopulaGANSynthesizer(metadata, epochs=150, verbose=True)
synthesizer.fit(data_copulagan)

# Generate synthetic data, with the same number of rows as the original dataset
synthetic_data = synthesizer.sample(num_rows=data_copulagan.shape[0])

# Restore category dtype to match real data (CopulaGAN outputs object dtype)
synthetic_data[cat_cols] = synthetic_data[cat_cols].astype('category')

print(synthetic_data.shape)
Gen. (-00.64) | Discrim. (-00.49): 100%|█████████████████████████████████████████████| 150/150 [01:39<00:00,  1.51it/s]
(4119, 21)
In [6]:
synthetic_data.head()
Out[6]:
age job marital education default housing loan contact month day_of_week ... campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
0 49 technician single university.degree unknown no no cellular jul mon ... 7 999 0 nonexistent 1.4 94.500 -42.3 4.967 5228.1 False
1 50 blue-collar married basic.4y no yes no telephone jun mon ... 2 999 0 nonexistent 1.4 93.950 -36.3 4.145 5228.1 False
2 50 blue-collar single basic.6y no yes no cellular jul tue ... 6 999 0 nonexistent 1.2 93.546 -35.9 4.955 5228.1 False
3 36 technician married basic.9y no yes yes cellular jul wed ... 4 999 0 nonexistent 1.4 94.097 -36.7 4.980 5228.1 False
4 88 admin. single university.degree no yes no cellular apr thu ... 1 4 1 failure -1.7 93.007 -26.9 1.085 5007.6 True

5 rows × 21 columns

In [7]:
summarize(synthetic_data)
Out[7]:
dtype count unique unique_pct missing missing_pct zero zero_pct top freq mean std min 50% max skew
age int64 4119 68 1.65 0 0.0 0 0.00 47.81 14.07 20.0 46.0 88.0 0.73
campaign int64 4119 33 0.80 0 0.0 0 0.00 3.16 4.15 1.0 2.0 34.0 3.75
nr.employed float64 4119 1453 35.28 0 0.0 0 0.00 5142.09 86.9 4963.6 5178.9 5228.1 -0.72
euribor3m float64 4119 1462 35.49 0 0.0 0 0.00 3.36 1.73 0.64 4.31 4.99 -0.36
cons.conf.idx float64 4119 221 5.37 0 0.0 0 0.00 -39.34 5.6 -50.8 -41.3 -26.9 0.56
cons.price.idx float64 4119 1392 33.79 0 0.0 0 0.00 93.56 0.59 92.2 93.49 94.77 -0.03
emp.var.rate float64 4119 49 1.19 0 0.0 19 0.46 -0.15 1.65 -3.4 0.7 1.4 -0.55
previous int64 4119 7 0.17 0 0.0 3090 75.02 0.41 0.92 0.0 0.0 6.0 3.2
pdays int64 4119 190 4.61 0 0.0 59 1.43 858.2 336.89 0.0 999.0 999.0 -1.99
duration int64 4119 1240 30.10 0 0.0 0 0.00 482.64 546.12 4.0 307.0 3643.0 2.83
job category 4119 12 0.29 0 0.0 0 0.00 blue-collar 889
day_of_week category 4119 5 0.12 0 0.0 0 0.00 tue 1166
month category 4119 10 0.24 0 0.0 0 0.00 may 920
contact category 4119 2 0.05 0 0.0 0 0.00 cellular 3016
poutcome category 4119 3 0.07 0 0.0 0 0.00 nonexistent 3178
loan category 4119 3 0.07 0 0.0 0 0.00 no 2695
housing category 4119 3 0.07 0 0.0 0 0.00 yes 2561
default category 4119 3 0.07 0 0.0 0 0.00 no 3186
education category 4119 8 0.19 0 0.0 0 0.00 high.school 1153
marital category 4119 4 0.10 0 0.0 0 0.00 married 1927
y category 4119 2 0.05 0 0.0 0 0.00 False 3168

Synthetic Data Validation¶


In this section, we use several methods to check how the synthetic data compares to the original dataset.

Per-variable Distribution Comparison¶

We plot each column of the real and synthetic data side-by-side using KDE curves for numeric columns and grouped bar plots for categorical columns.

A well-fitting synthetic dataset should show close overlap throughout.

The table below the plots summarizes mean, std, min, and max for the numeric columns.

There are noticeable differences in the means of several features, e.g. duration, previous, and emp.var.rate.

In [8]:
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan',
                    'contact', 'month', 'day_of_week', 'poutcome']
boolean_cols     = ['y']
numeric_cols     = ['age', 'duration', 'campaign', 'pdays', 'previous',
                    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                    'euribor3m', 'nr.employed']

all_cols = categorical_cols + boolean_cols + numeric_cols
n_cols   = len(all_cols)

# Set up grid
n_grid_cols = 3
n_grid_rows = int(np.ceil(n_cols / n_grid_cols))

fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(18, n_grid_rows * 4))
axes = axes.flatten()

for i, col in enumerate(all_cols):
    ax = axes[i]

    if col in numeric_cols:
        # Overlapping KDE plots for numeric columns
        data[col].plot.kde(ax=ax, label='Real', color='steelblue', linewidth=2)
        synthetic_data[col].plot.kde(ax=ax, label='Synthetic', color='coral',
                                      linewidth=2, linestyle='--')
        ax.set_title(col, fontsize=13)
        ax.legend()
        ax.tick_params(axis='both', labelsize=11)

    else:
        # Grouped bar chart for categorical and boolean columns
        real_counts = data[col].value_counts(normalize=True).sort_index()
        synth_counts = synthetic_data[col].value_counts(normalize=True).sort_index()

        # Align categories in case synthetic has different order
        all_categories = sorted(set(real_counts.index) | set(synth_counts.index),
                                key=str)
        real_counts  = real_counts.reindex(all_categories, fill_value=0)
        synth_counts = synth_counts.reindex(all_categories, fill_value=0)

        x = np.arange(len(all_categories))
        width = 0.35

        ax.bar(x - width/2, real_counts.values,  width, label='Real',
               color='steelblue', alpha=0.8)
        ax.bar(x + width/2, synth_counts.values, width, label='Synthetic',
               color='coral', alpha=0.8)

        ax.set_xticks(x)
        ax.set_xticklabels(all_categories, rotation=45, ha='right', fontsize=11)
        ax.set_title(col, fontsize=13)
        ax.legend()

# Hide any unused subplots
for j in range(i + 1, len(axes)):
    axes[j].set_visible(False)

plt.suptitle('Per-Variable Distribution: Real vs Synthetic', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()
[Figure: per-variable distribution plots (KDE curves and grouped bars), real vs synthetic]
In [9]:
real_stats  = data[numeric_cols].describe().T[['mean', 'std', 'min', 'max']]
synth_stats = synthetic_data[numeric_cols].describe().T[['mean', 'std', 'min', 'max']]

real_stats.columns  = ['real_mean', 'real_std', 'real_min', 'real_max']
synth_stats.columns = ['synth_mean', 'synth_std', 'synth_min', 'synth_max']

summary = pd.concat([real_stats, synth_stats], axis=1)
style_table(summary.round(3), caption="Numeric Column Statistics: Real vs Synthetic")
Out[9]:
Numeric Column Statistics: Real vs Synthetic
  real_mean real_std real_min real_max synth_mean synth_std synth_min synth_max
age 40.1 10.3 18.0 88.0 47.8 14.1 20.0 88.0
duration 256.8 254.7 0.0 3643.0 482.6 546.1 4.0 3643.0
campaign 2.5 2.6 1.0 35.0 3.2 4.1 1.0 34.0
pdays 960.4 191.9 0.0 999.0 858.2 336.9 0.0 999.0
previous 0.2 0.5 0.0 6.0 0.4 0.9 0.0 6.0
emp.var.rate 0.1 1.6 -3.4 1.4 -0.1 1.6 -3.4 1.4
cons.price.idx 93.6 0.6 92.2 94.8 93.6 0.6 92.2 94.8
cons.conf.idx -40.5 4.6 -50.8 -26.9 -39.3 5.6 -50.8 -26.9
euribor3m 3.6 1.7 0.6 5.0 3.4 1.7 0.6 5.0
nr.employed 5166.5 73.7 4963.6 5228.1 5142.1 86.9 4963.6 5228.1

Correlation Structure¶


Beyond individual column distributions, a synthetic dataset should preserve relationships between variables. We first compare numeric correlation matrices visually using heatmaps. Then we use SDV's evaluate_quality function to compute Column Pair Trends scores.

Column Pair Trends evaluates two types of relationships:

  • Numeric-numeric pairs (Correlation Similarity): how well linear correlations between numeric columns are reproduced
  • Categorical/mixed pairs (Contingency Similarity): how well associations between categorical columns, and between categorical and numeric columns, are preserved

Each row in the tables below represents a column pair. The Score measures how well the relationship was preserved in the synthetic data, from 0 (not preserved) to 1 (perfectly preserved). The tables below show only column pairs that meet SDV's association threshold; pairs where the real data shows a weak relationship are excluded.

We also compute mean scores to summarize performance of each of the pair types.
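For the numeric-numeric case, the score is simple to reproduce by hand: SDMetrics computes CorrelationSimilarity as 1 - |r_real - r_synth| / 2, so an exactly preserved correlation scores 1.0 and a fully reversed one scores 0.0. A toy sketch:

```python
import numpy as np
import pandas as pd

def correlation_similarity(real: pd.DataFrame, synth: pd.DataFrame,
                           col_a: str, col_b: str) -> float:
    """Score in [0, 1]; 1 means the pairwise correlation is perfectly preserved."""
    r_real = real[col_a].corr(real[col_b])
    r_synth = synth[col_a].corr(synth[col_b])
    return 1 - abs(r_real - r_synth) / 2

rng = np.random.default_rng(0)
x = rng.normal(size=500)
real = pd.DataFrame({'a': x, 'b': x + rng.normal(scale=0.2, size=500)})
synth = pd.DataFrame({'a': rng.normal(size=500), 'b': rng.normal(size=500)})

print(correlation_similarity(real, real, 'a', 'b'))   # 1.0: identical data
print(correlation_similarity(real, synth, 'a', 'b'))  # lower: correlation lost
```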

In [10]:
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

real_corr  = data[numeric_cols].corr()
synth_corr = synthetic_data[numeric_cols].corr()

# Shared color scale
vmin, vmax = -1, 1

for ax, corr, title in zip(axes, [real_corr, synth_corr], ['Real Data', 'Synthetic Data']):
    im = ax.imshow(corr, cmap='coolwarm', vmin=vmin, vmax=vmax, aspect='auto')
    ax.set_xticks(range(len(numeric_cols)))
    ax.set_yticks(range(len(numeric_cols)))
    ax.set_xticklabels(numeric_cols, rotation=45, ha='right', fontsize=11)
    ax.set_yticklabels(numeric_cols, fontsize=11)
    ax.set_title(title, fontsize=14)

    # Annotate cells with correlation values
    for i in range(len(numeric_cols)):
        for j in range(len(numeric_cols)):
            ax.text(j, i, f'{corr.iloc[i, j]:.2f}',
                    ha='center', va='center', fontsize=8,
                    color='white' if abs(corr.iloc[i, j]) > 0.6 else 'black')

# Dedicated colorbar axis
cbar_ax = fig.add_axes([0.92, 0.18, 0.02, 0.7])  # [left, bottom, width, height]
fig.colorbar(im, cax=cbar_ax)

plt.suptitle('Correlation Matrix: Real vs Synthetic', fontsize=16)
plt.tight_layout(rect=[0, 0, 0.91, 1])
plt.show()
[Figure: annotated correlation matrix heatmaps, real vs synthetic]
In [11]:
quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    verbose=False
)

pair_trends = quality_report.get_details(property_name='Column Pair Trends')
pair_trends_nona = pair_trends.dropna(subset=['Score'])

# Split into correlation (numeric-numeric) and contingency (categorical) pairs
correlation_pairs = pair_trends_nona[pair_trends_nona['Metric'] == 'CorrelationSimilarity']
contingency_pairs = pair_trends_nona[pair_trends_nona['Metric'] == 'ContingencySimilarity']
In [12]:
style_table(correlation_pairs[['Column 1', 'Column 2', 'Score', 'Real Correlation', 'Synthetic Correlation']],
            caption="Numeric-Numeric Pairs (Correlation Similarity)")
Out[12]:
Numeric-Numeric Pairs (Correlation Similarity)
  Column 1 Column 2 Score Real Correlation Synthetic Correlation
195 emp.var.rate cons.price.idx 0.8 0.8 0.4
197 emp.var.rate euribor3m 0.8 1.0 0.5
198 emp.var.rate nr.employed 0.8 0.9 0.4
201 cons.price.idx euribor3m 0.9 0.7 0.4
207 euribor3m nr.employed 0.8 0.9 0.5
In [13]:
style_table(contingency_pairs[['Column 1', 'Column 2', 'Score', 'Real Association']],
            caption="Categorical/Mixed Pairs (Contingency Similarity)")
Out[13]:
Categorical/Mixed Pairs (Contingency Similarity)
  Column 1 Column 2 Score Real Association
21 job education 0.6 0.4
90 housing loan 0.8 0.7
119 contact month 0.7 0.6
126 contact emp.var.rate 0.6 0.7
127 contact cons.price.idx 0.6 0.7
128 contact cons.conf.idx 0.7 0.4
129 contact euribor3m 0.8 0.5
130 contact nr.employed 0.7 0.5
138 month emp.var.rate 0.4 0.7
139 month cons.price.idx 0.2 0.7
140 month cons.conf.idx 0.3 0.6
141 month euribor3m 0.5 0.5
142 month nr.employed 0.4 0.6
164 duration y 0.7 0.4
174 pdays previous 0.8 0.6
175 pdays poutcome 0.8 0.9
181 pdays y 0.8 0.3
182 previous poutcome 0.8 0.7
189 poutcome emp.var.rate 0.6 0.4
190 poutcome cons.price.idx 0.6 0.4
191 poutcome cons.conf.idx 0.7 0.4
192 poutcome euribor3m 0.7 0.4
193 poutcome nr.employed 0.6 0.4
194 poutcome y 0.9 0.3
199 emp.var.rate y 0.7 0.3
203 cons.price.idx y 0.6 0.3
206 cons.conf.idx y 0.7 0.4
208 euribor3m y 0.7 0.4
209 nr.employed y 0.7 0.4
In [14]:
print(f"Numeric-Numeric Pairs Mean Score: {correlation_pairs['Score'].mean():.3f}")
print(f"Categorical/Mixed Pairs Mean Score: {contingency_pairs['Score'].mean():.3f}")
Numeric-Numeric Pairs Mean Score: 0.798
Categorical/Mixed Pairs Mean Score: 0.644

SDV Quality Scores¶


SDV's evaluate_quality function scores each column individually on how well its distribution was reproduced ('Column Shapes') and each column pair on how well their relationship was preserved ('Column Pair Trends').

We already examined the pair trends above; here we look at the per-column scores.
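Both per-column metrics are straightforward to approximate by hand: KSComplement is 1 minus the two-sample Kolmogorov-Smirnov statistic, and TVComplement is 1 minus the total variation distance between category frequencies. A sketch (assuming scipy is available):

```python
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

def ks_complement(real: pd.Series, synth: pd.Series) -> float:
    """1 minus the two-sample KS statistic; 1.0 = identical distributions."""
    return 1 - ks_2samp(real, synth).statistic

def tv_complement(real: pd.Series, synth: pd.Series) -> float:
    """1 minus total variation distance between category frequencies."""
    p = real.value_counts(normalize=True)
    q = synth.value_counts(normalize=True)
    cats = p.index.union(q.index)
    return 1 - 0.5 * (p.reindex(cats, fill_value=0)
                      - q.reindex(cats, fill_value=0)).abs().sum()

rng = np.random.default_rng(0)
real_num = pd.Series(rng.normal(40, 10, size=1000))
print(ks_complement(real_num, real_num))            # 1.0

real_cat = pd.Series(['yes'] * 90 + ['no'] * 10)
synth_cat = pd.Series(['yes'] * 80 + ['no'] * 20)
print(tv_complement(real_cat, synth_cat))           # ≈ 0.9
```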

In [15]:
quality_report.get_details(property_name='Column Shapes')
Out[15]:
Column Metric Score
0 age KSComplement 0.764020
1 job TVComplement 0.879825
2 marital TVComplement 0.858704
3 education TVComplement 0.866472
4 default TVComplement 0.968682
5 housing TVComplement 0.873513
6 loan TVComplement 0.841224
7 contact TVComplement 0.911629
8 month TVComplement 0.871328
9 day_of_week TVComplement 0.837096
10 duration KSComplement 0.749697
11 campaign KSComplement 0.932993
12 pdays KSComplement 0.889051
13 previous TVComplement 0.894877
14 poutcome TVComplement 0.916242
15 emp.var.rate KSComplement 0.864530
16 cons.price.idx KSComplement 0.817674
17 cons.conf.idx KSComplement 0.815246
18 euribor3m KSComplement 0.837825
19 nr.employed KSComplement 0.738043
20 y TVComplement 0.878611

Privacy and Memorization Checks¶


We run two checks to confirm that the synthetic data doesn't reproduce records from the training data.

  • New Row Score (NewRowSynthesis): checks whether any synthetic rows are exact copies of real rows. A score of 1.0 means every synthetic row is new.
  • Nearest-Neighbor Privacy Score (DCRBaselineProtection): measures how close synthetic rows are to their nearest real neighbors, relative to a random data baseline. A score of 1.0 means the synthetic data is no closer to the real data than random noise (the best possible privacy). A score near 0.5 is typical for a well-trained model.

The New Row Score of 1.0 confirms no exact copies. The Nearest-Neighbor Privacy Score of 0.417 is somewhat below the 0.5 typical of a well-trained model: the synthetic data has a median distance of 0.148 to its nearest real neighbors, compared with 0.354 for randomly generated data. In other words, synthetic rows sit closer to real rows than random noise would, which is expected for a model that has learned the data distribution well. It does not indicate memorization of individual records, which the New Row Score of 1.0 rules out.
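The DCR (distance to closest record) mechanics behind this score can be sketched with toy numeric data. This is a simplified Euclidean version; the real metric also normalizes columns and handles categoricals:

```python
import numpy as np

rng = np.random.default_rng(0)

def median_dcr(candidate: np.ndarray, real: np.ndarray) -> float:
    """Median distance-to-closest-record: for each candidate row,
    the Euclidean distance to its nearest real row."""
    dists = np.linalg.norm(candidate[:, None, :] - real[None, :, :], axis=2)
    return float(np.median(dists.min(axis=1)))

real = rng.normal(size=(300, 5))
synthetic = real + rng.normal(scale=0.2, size=(300, 5))  # hugs the real data
random_baseline = rng.uniform(real.min(axis=0), real.max(axis=0),
                              size=(300, 5))

# Score is the ratio of median DCRs, capped at 1 (1 = as private as noise)
score = min(median_dcr(synthetic, real) / median_dcr(random_baseline, real), 1.0)
print(f"{score:.2f}")  # well below 1: synthetic rows sit close to real rows
```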

In [16]:
metadata_dict = metadata.to_dict()['tables']['bank_marketing']

def to_str_categories(df):
    return df.copy().assign(**{
        col: df[col].astype(str) 
        for col in df.select_dtypes('category').columns
    })

new_row_score = NewRowSynthesis.compute(
    real_data=to_str_categories(data),
    synthetic_data=to_str_categories(synthetic_data),
    metadata=metadata_dict,
    numerical_match_tolerance=0.01,
    synthetic_sample_size=None
)

nn_privacy_score = DCRBaselineProtection.compute_breakdown(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata_dict
)

print(f"New Row Score: {new_row_score:.3f}")
print(f"Nearest-Neighbor Privacy Score: {nn_privacy_score['score']:.3f}")
print(f"Synthetic median DCR to real data: {nn_privacy_score['median_DCR_to_real_data']['synthetic_data']:.3f}")
print(f"Random baseline median DCR to real data: {nn_privacy_score['median_DCR_to_real_data']['random_data_baseline']:.3f}")
New Row Score: 1.000
Nearest-Neighbor Privacy Score: 0.417
Synthetic median DCR to real data: 0.148
Random baseline median DCR to real data: 0.354

SDV Validity Checks¶


To conclude this section, we run SDV's built-in diagnostic report, which performs basic checks across the entire dataset and flags any problem areas. It checks:

  • Data Structure: the synthetic table has the same column names as the real data.
  • Data Validity: synthetic values respect the original data's column types and value ranges.

We expect both scores to be close to 100%.

In [17]:
diagnostic_report = run_diagnostic(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    verbose=True
)
Generating report ...

(1/2) Evaluating Data Validity: |███████████████████████████████████████████████████| 21/21 [00:00<00:00, 1613.70it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 333.41it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Synthetic Data Quality Summary¶


The table below summarizes the key validation metrics computed in this section.

  • Data Validity and Data Structure should score close to 100%.
  • Column Shapes, Column Pair Trends, and the privacy metrics don't have a fixed target; however, higher is better for quality scores, while a Nearest-Neighbor Privacy Score near 0.5 indicates a good balance between utility and privacy.
In [18]:
summary = {
    'Column Shapes':                    quality_report.get_properties().set_index('Property').loc['Column Shapes', 'Score'],
    'Column Pair Trends (numeric)':     correlation_pairs['Score'].mean(),
    'Column Pair Trends (categorical)': contingency_pairs['Score'].mean(),
    'Data Validity':                    diagnostic_report.get_properties().set_index('Property').loc['Data Validity', 'Score'],
    'Data Structure':                   diagnostic_report.get_properties().set_index('Property').loc['Data Structure', 'Score'],
    'New Row Score':                    new_row_score,
    'Nearest-Neighbor Privacy':         nn_privacy_score['score'],
}

summary_df = pd.DataFrame.from_dict(summary, orient='index', columns=['Score'])
style_table(summary_df, caption='Synthetic Data Quality Summary', precision=2)
Out[18]:
Synthetic Data Quality Summary
  Score
Column Shapes 0.86
Column Pair Trends (numeric) 0.80
Column Pair Trends (categorical) 0.64
Data Validity 1.00
Data Structure 1.00
New Row Score 1.00
Nearest-Neighbor Privacy 0.42

Save Outputs¶


For reproducibility, we save the following items:

  • The synthetic dataset
  • Fitted synthesizer
  • Metadata

The synthesizer can be reloaded to generate additional synthetic rows without retraining.

In [19]:
os.makedirs('outputs', exist_ok=True)

# Save synthetic data to CSV
synthetic_data.to_csv('outputs/synthetic_data.csv', index=False)

# Save the fitted synthesizer for reproducibility
synthesizer.save('outputs/copulagan_synthesizer.pkl')

# Save metadata to JSON
metadata.save_to_json('outputs/metadata.json', mode='overwrite')

Conditional Generation¶


Beyond generating a full synthetic dataset, SDV supports conditional generation: we sample rows that match specified demographic or behavioral criteria while drawing the remaining columns from the learned distribution. This can be useful for generating targeted test datasets, augmenting underrepresented segments, or simulating specific customer profiles.

Conditional generation respects statistical patterns learned from the data but does not enforce logical constraints between columns. In practice the model learns these relationships reasonably well: for example, conditioning on job='retired' yields a mean age of about 46, with older clients well represented. Implausible combinations can still appear, however, such as retired clients in their twenties. SDV's constraint framework can enforce hard rules for such cases, but it is outside the scope of this notebook.
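Where only a handful of hard rules matter, a lightweight alternative to the constraint framework is post-hoc filtering: oversample, then drop rows that violate the rule. A sketch with a hypothetical plausibility rule (the 50-year threshold is an illustrative assumption, not something in the data):

```python
import pandas as pd

MIN_RETIREMENT_AGE = 50  # hypothetical rule, for illustration only

# Stand-in for conditionally sampled rows
rows = pd.DataFrame({
    'age': [28, 61, 45, 70],
    'job': ['retired', 'retired', 'admin.', 'retired'],
})

implausible = (rows['job'] == 'retired') & (rows['age'] < MIN_RETIREMENT_AGE)
plausible = rows[~implausible].reset_index(drop=True)
print(len(plausible))  # 3: the 28-year-old 'retired' row is dropped
```

In practice you would request more rows than needed from sample_from_conditions and filter down, since the rejection rate depends on how often the model produces violations.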

In [20]:
# Basic example: generate 200 rows for young clients (age 25)
young_condition = Condition(
    num_rows=200,
    column_values={'age': 25}
)
young_synthetic = synthesizer.sample_from_conditions(conditions=[young_condition])
young_synthetic[cat_cols] = young_synthetic[cat_cols].astype('category') # Restore category dtype to match real data (CopulaGAN outputs object dtype)
print(young_synthetic.shape)
young_synthetic.head()
Sampling conditions: 100%|███████████████████████████████████████████████████████████| 200/200 [00:04<00:00, 48.81it/s]
(200, 21)

Out[20]:
age job marital education default housing loan contact month day_of_week ... campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
0 25 blue-collar married university.degree unknown yes unknown telephone may thu ... 2 999 0 failure 1.0 93.997 -36.5 4.928 5169.5 False
1 25 unemployed single university.degree no no yes cellular mar wed ... 2 999 0 nonexistent -1.1 93.548 -50.8 1.560 5091.6 True
2 25 blue-collar married unknown unknown no yes cellular jul tue ... 2 999 0 nonexistent 1.4 94.004 -42.9 4.962 5228.1 False
3 25 admin. single basic.9y unknown unknown no cellular sep fri ... 6 999 0 nonexistent -1.7 93.955 -42.0 4.846 5228.1 False
4 25 blue-collar divorced high.school no yes no cellular may fri ... 2 999 0 nonexistent 1.4 94.455 -42.0 4.972 5228.1 False

5 rows × 21 columns

In [21]:
# Complex example: generate 200 rows for retired clients, contacted by cellular in May, with no previous campaign contact
complex_condition = Condition(
    num_rows=200,
    column_values={
        'job':      'retired',
        'contact':  'cellular',
        'month':    'may',
        'poutcome': 'nonexistent',
        'previous': 0
    }
)
complex_synthetic = synthesizer.sample_from_conditions(conditions=[complex_condition])
complex_synthetic[cat_cols] = complex_synthetic[cat_cols].astype('category') # Restore category dtype to match real data (CopulaGAN outputs object dtype)
print(complex_synthetic.shape)
complex_synthetic.head()
Sampling conditions: 100%|███████████████████████████████████████████████████████████| 200/200 [00:07<00:00, 28.25it/s]
(200, 21)

Out[21]:
age job marital education default housing loan contact month day_of_week ... campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
0 33 retired married basic.9y no no no cellular may tue ... 1 999 0 nonexistent 0.6 94.559 -35.9 4.453 5228.1 False
1 50 retired married basic.9y unknown yes no cellular may fri ... 8 999 0 nonexistent 0.1 94.122 -41.8 4.915 5194.5 False
2 61 retired married professional.course no yes yes cellular may tue ... 1 999 0 nonexistent 1.4 93.980 -47.2 1.141 4989.6 True
3 44 retired married university.degree no no yes cellular may wed ... 2 999 0 nonexistent 1.4 93.095 -42.7 4.966 5228.1 False
4 45 retired married unknown unknown yes no cellular may fri ... 1 999 0 nonexistent 1.4 94.059 -42.9 4.890 5191.3 False

5 rows × 21 columns

In [22]:
summarize(complex_synthetic)
Out[22]:
dtype count unique unique_pct missing missing_pct zero zero_pct top freq mean std min 50% max skew
age int64 200 47 23.5 0 0.0 0 0.0 46.22 12.19 24.0 45.0 88.0 0.92
campaign int64 200 15 7.5 0 0.0 0 0.0 2.91 3.96 1.0 2.0 30.0 4.49
nr.employed float64 200 139 69.5 0 0.0 0 0.0 5138.65 87.1 4963.6 5177.45 5228.1 -0.79
euribor3m float64 200 168 84.0 0 0.0 0 0.0 3.46 1.67 0.64 4.32 4.99 -0.53
cons.conf.idx float64 200 80 40.0 0 0.0 0 0.0 -40.49 4.91 -50.8 -41.75 -26.9 0.74
cons.price.idx float64 200 182 91.0 0 0.0 0 0.0 93.51 0.59 92.2 93.46 94.77 -0.08
emp.var.rate float64 200 46 23.0 0 0.0 3 1.5 -0.45 1.73 -3.4 -0.4 1.4 -0.37
previous int64 200 1 0.5 0 0.0 200 100.0 0.0 0.0 0.0 0.0 0.0 0.0
pdays int64 200 6 3.0 0 0.0 1 0.5 974.51 153.37 0.0 999.0 999.0 -6.14
duration int64 200 183 91.5 0 0.0 0 0.0 492.77 531.43 5.0 327.0 3643.0 2.69
job category 200 1 0.5 0 0.0 0 0.0 retired 200
day_of_week category 200 5 2.5 0 0.0 0 0.0 wed 65
month category 200 1 0.5 0 0.0 0 0.0 may 200
contact category 200 1 0.5 0 0.0 0 0.0 cellular 200
poutcome category 200 1 0.5 0 0.0 0 0.0 nonexistent 200
loan category 200 3 1.5 0 0.0 0 0.0 no 123
housing category 200 3 1.5 0 0.0 0 0.0 yes 133
default category 200 3 1.5 0 0.0 0 0.0 no 148
education category 200 8 4.0 0 0.0 0 0.0 high.school 51
marital category 200 4 2.0 0 0.0 0 0.0 married 91
y category 200 2 1.0 0 0.0 0 0.0 False 164