import argparse
import json
from pathlib import Path

import pandas as pd
import torch

from aindo.rdml.eval import compute_privacy_stats, report
from aindo.rdml.relational import Column, ForeignKey, PrimaryKey, RelationalData, Schema, Table
from aindo.rdml.synth import (
    TabularDataset, TabularModel, TabularPreproc, TabularTrainer,
    TextDataset, TextModel, TextPreproc, TextTrainer,
)
def preproc_data(df: pd.DataFrame, schema: Schema) -> dict[str, pd.DataFrame]:
    """Split the Airbnb dataset into two tables: host (parent) and listings (child)."""
    return {
        "host": df.loc[:, list(schema.tables["host"].all_columns)].drop_duplicates(),
        "listings": df.loc[:, list(schema.tables["listings"].all_columns)],
    }
def postproc_data(data: RelationalData) -> pd.DataFrame:
    """Join the host and listings tables along the foreign key to recover the original format of the Airbnb dataset."""
    return data["host"].merge(data["listings"], on="host_id")
device: str | torch.device | None,
# Load data and define schema
df = pd.read_csv(data_dir / "airbnb.csv")
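# Two-table schema: each listing row references its parent host row through the host_id
# foreign key. The Schema(...)/Table(...) wrappers and the host primary key below are
# inferred from the imports and from the merge on "host_id"; several columns of the
# original Airbnb dataset are omitted here.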
schema = Schema(
    host=Table(
        host_id=PrimaryKey(),
        calculated_host_listings_count=Column.NUMERIC,
        # ... further host columns omitted ...
    ),
    listings=Table(
        host_id=ForeignKey(parent="host"),
        neighbourhood_group=Column.CATEGORICAL,
        neighbourhood=Column.CATEGORICAL,
        longitude=Column.NUMERIC,
        room_type=Column.CATEGORICAL,
        minimum_nights=Column.INTEGER,
        number_of_reviews=Column.INTEGER,
        last_review=Column.DATETIME,
        reviews_per_month=Column.NUMERIC,
        availability_365=Column.INTEGER,
        # ... further listings columns omitted ...
    ),
)
data = preproc_data(df=df, schema=schema)
data = RelationalData(data=data, schema=schema)
_, data = data.split(ratio=0.2)  # Keep only a subset of the data (presumably used for the quick test run)
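# Fit the preprocessors: a TabularPreproc for the relational (tabular) part of the schema,
# plus one TextPreproc per table, which is derived from the tabular preprocessor and
# presumably handles that table's free-text columns.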
preproc = TabularPreproc.from_schema(schema=schema).fit(data=data)
preproc_text_host = TextPreproc.from_tabular(preproc=preproc, table="host").fit(data=data)
preproc_text_listings = TextPreproc.from_tabular(preproc=preproc, table="listings").fit(data=data)
data_train_valid, data_test = data.split(ratio=split_ratio)
data_train, data_valid = data_train_valid.split(ratio=split_ratio)
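# Build train/validation datasets for the tabular model and for each of the two text models.
# With on_disk=True the preprocessed datasets are presumably stored on disk rather than kept
# in memory, which helps with larger datasets.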
dataset_train = TabularDataset.from_data(data=data_train, preproc=preproc, on_disk=True)
dataset_valid = TabularDataset.from_data(data=data_valid, preproc=preproc, on_disk=True)
dataset_text_host_train = TextDataset.from_data(data=data_train, preproc=preproc_text_host, on_disk=True)
dataset_text_host_valid = TextDataset.from_data(data=data_valid, preproc=preproc_text_host, on_disk=True)
dataset_text_listings_train = TextDataset.from_data(data=data_train, preproc=preproc_text_listings, on_disk=True)
dataset_text_listings_valid = TextDataset.from_data(data=data_valid, preproc=preproc_text_listings, on_disk=True)
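# Build one tabular model for the relational tables and one text model per table with
# free-text columns. The "tiny"/"small"/"normal" sizes are presumably preset model
# configurations; the quick flag picks the smallest one for a fast test run.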
model = TabularModel.build(preproc=preproc, size="tiny" if quick else "small")
model.device = device  # If device is None, it is set to CUDA when available, otherwise to the CPU
model_text_host = TextModel.build(
    preproc=preproc_text_host,
    size="tiny" if quick else "normal",
    block_size=max(dataset_text_host_train.max_text_len, dataset_text_host_valid.max_text_len),
)
model_text_host.device = device
model_text_listings = TextModel.build(
    preproc=preproc_text_listings,
    size="tiny" if quick else "normal",
    block_size=max(dataset_text_listings_train.max_text_len, dataset_text_listings_valid.max_text_len),
)
model_text_listings.device = device
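# Train the three models: save_best is the path where the best checkpoint is saved and
# tensorboard the directory for TensorBoard logs; the remaining train() arguments
# (e.g. the epoch/step schedule and the validation interval) are elided in this listing.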
TabularTrainer(model=model).train(
    save_best=output_dir / "ckpt" / "tabular.pt",
    tensorboard=output_dir / "tb" / "tabular",
TextTrainer(model=model_text_host).train(
    dataset=dataset_text_host_train,
    dataset=dataset_text_host_valid,  # validation dataset; the enclosing keyword is elided here
    save_best=output_dir / "ckpt" / "text_host.pt",
    tensorboard=output_dir / "tb" / "text_host",
TextTrainer(model=model_text_listings).train(
    dataset=dataset_text_listings_train,
    dataset=dataset_text_listings_valid,  # validation dataset; the enclosing keyword is elided here
    save_best=output_dir / "ckpt" / "text_listings.pt",
    tensorboard=output_dir / "tb" / "text_listings",
# Generate synthetic data
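# The tabular model generates the synthetic host and listings tables first (as many hosts as
# in the original data); the two text models then presumably fill in the free-text columns of
# their respective tables, conditioned on the synthetic tabular data.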
data_synth = model.generate(
    n_samples=data["host"].shape[0],
data_synth = model_text_host.generate(
data_synth = model_text_listings.generate(
synth_dir = output_dir / "synth"
data_synth.to_csv(synth_dir, escapechar="\\")
# Revert to the original form
df_synth = postproc_data(data=data_synth).loc[:, df.columns]
df_synth.to_csv(synth_dir / "airbnb.csv", index=False, escapechar="\\")
# Compute and print PDF report
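# report() (imported from aindo.rdml.eval) presumably compares the original and synthetic
# data and writes the evaluation report as a PDF to the path below.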
path=output_dir / "report.pdf",
# Compute extra privacy stats and print some results
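# compute_privacy_stats() presumably returns one statistics object per table; below, the
# privacy score (with its standard deviation) and the fraction of records at risk are
# collected into a JSON-serializable dict.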
privacy_stats = compute_privacy_stats(
    # ... (arguments elided; presumably the original and synthetic data) ...
)
privacy_stats_out = {
    t: {
        "privacy_score": ps.privacy_score,
        "privacy_score_std": ps.privacy_score_std,
        "%_points_at_risk": ps.risk * 100,
    }
    for t, ps in privacy_stats.items()
}
with open(output_dir / "privacy_stats.json", mode="w", encoding="utf-8") as f:
    json.dump(privacy_stats_out, f)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("data_dir", type=Path, help="The directory where to find the 'airbnb' dataset")
    parser.add_argument("output_dir", type=Path, help="The output directory")
    help="Training epochs (or steps if the --steps flag is used)",  # from the add_argument for the value read below as args.n
    parser.add_argument("--steps", "-s", action="store_true", help="Use steps instead of epochs")
    parser.add_argument("--valid-each", "-v", type=int, default=200, help="# steps between validations")
    parser.add_argument("--device", "-g", default=None, help="Training device")
    parser.add_argument("--memory", "-m", type=int, default=4096, help="Available memory (MB)")
    parser.add_argument(
        "--quick", "-q", action="store_true", help="Perform a quick test run, with reduced data and small models"
    )
    args = parser.parse_args()
    # Keyword arguments forwarded to the example's entry-point call (the call itself is elided here):
    output_dir=args.output_dir,
    n_epochs=None if args.steps else args.n,
    n_steps=args.n if args.steps else None,
    valid_each=args.valid_each,
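    # Hypothetical invocation (script and directory names are placeholders):
    #   python airbnb.py ./data ./output --quick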