import argparse
import json
from pathlib import Path

import pandas as pd
import torch

from aindo.rdml.eval import compute_privacy_stats, report
from aindo.rdml.relational import Column, ForeignKey, PrimaryKey, RelationalData, Schema, Table
from aindo.rdml.synth import (
    TabularDataset,
    TabularModel,
    TabularPreproc,
    TabularTrainer,
    TextDataset,
    TextModel,
    TextPreproc,
    TextTrainer,
    Validation,
)


def preproc_data(df: pd.DataFrame, schema: Schema) -> dict[str, pd.DataFrame]:
    """Split the Airbnb dataset in two tables: host (parent) and listings (child)."""
    return {
        'host': df.loc[:, list(schema.tables['host'].all_columns)].drop_duplicates(),
        'listings': df.loc[:, list(schema.tables['listings'].all_columns)],
    }


def postproc_data(data: RelationalData) -> pd.DataFrame:
    """Join the host and listings tables along the foreign key to recover the original format of the Airbnb dataset."""
    return data['host'].merge(data['listings'], on='host_id')


def train_and_generate(  # hypothetical function name; the parameters match the CLI arguments parsed below
    data_dir: Path,
    output_dir: Path,
    n_epochs: int | None,
    n_steps: int | None,
    valid_each: int,
    device: str | torch.device | None,
    memory: int,
    quick: bool,
) -> None:
    # Load data and define schema
    df = pd.read_csv(data_dir / 'airbnb.csv')
    # Schema of the NYC Airbnb dataset, split into a host (parent) table and a listings (child) table.
    # The free-text columns (host_name, name) are marked with Column.TEXT (assumed) and handled by the text models.
    schema = Schema(
        host=Table(
            host_id=PrimaryKey(),
            host_name=Column.TEXT,
            calculated_host_listings_count=Column.NUMERIC,
        ),
        listings=Table(
            id=PrimaryKey(),
            host_id=ForeignKey(parent='host'),
            name=Column.TEXT,
            neighbourhood_group=Column.CATEGORICAL,
            neighbourhood=Column.CATEGORICAL,
            latitude=Column.NUMERIC,
            longitude=Column.NUMERIC,
            room_type=Column.CATEGORICAL,
            price=Column.NUMERIC,
            minimum_nights=Column.INTEGER,
            number_of_reviews=Column.INTEGER,
            last_review=Column.DATETIME,
            reviews_per_month=Column.NUMERIC,
            availability_365=Column.INTEGER,
        ),
    )
    data = preproc_data(df=df, schema=schema)
    data = RelationalData(data=data, schema=schema)
    if quick:
        # Quick test run: keep only a subset of the data
        _, data = data.split(ratio=0.2)
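    # Fit the tabular preprocessor on the full relational data, plus one text preprocessor per table with text columns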
    preproc = TabularPreproc.from_schema(schema=schema).fit(data=data)
    preproc_text_host = TextPreproc.from_tabular(preproc=preproc, table='host').fit(data=data)
    preproc_text_listings = TextPreproc.from_tabular(preproc=preproc, table='listings').fit(data=data)
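    # Hold out test and validation splits, then build the train/validation datasets for each model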
    split_ratio = 0.1  # hypothetical value: the split ratio used in the original script is not shown
    data_train_valid, data_test = data.split(ratio=split_ratio)
    data_train, data_valid = data_train_valid.split(ratio=split_ratio)
    dataset_train = TabularDataset.from_data(data=data_train, preproc=preproc, on_disk=True)
    dataset_valid = TabularDataset.from_data(data=data_valid, preproc=preproc, on_disk=True)
    dataset_text_host_train = TextDataset.from_data(data=data_train, preproc=preproc_text_host, on_disk=True)
    dataset_text_host_valid = TextDataset.from_data(data=data_valid, preproc=preproc_text_host, on_disk=True)
    dataset_text_listings_train = TextDataset.from_data(data=data_train, preproc=preproc_text_listings, on_disk=True)
    dataset_text_listings_valid = TextDataset.from_data(data=data_valid, preproc=preproc_text_listings, on_disk=True)
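    # Build the models: a tabular model for the relational data and one text model per table with text columns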
    model = TabularModel.build(preproc=preproc, size='tiny' if quick else 'small')
    model.device = device  # A device of None means CUDA will be used if available, otherwise CPU
    model_text_host = TextModel.build(
        preproc=preproc_text_host,
        size='tiny' if quick else 'normal',
        block_size=max(dataset_text_host_train.max_text_len, dataset_text_host_valid.max_text_len),
    )
    model_text_host.device = device
    model_text_listings = TextModel.build(
        preproc=preproc_text_listings,
        size='tiny' if quick else 'normal',
        block_size=max(dataset_text_listings_train.max_text_len, dataset_text_listings_valid.max_text_len),
    )
    model_text_listings.device = device
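    # Train the models: the tabular model first, then the two text models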
    # NOTE: the `n_epochs`, `n_steps`, `memory` and `valid=Validation(...)` keywords below are assumptions
    # reconstructed from the script's CLI arguments; check the aindo.rdml documentation for the exact signature.
    TabularTrainer(model=model).train(
        dataset=dataset_train,
        n_epochs=n_epochs,
        n_steps=n_steps,
        memory=memory,
        valid=Validation(dataset=dataset_valid, each=valid_each,
                         save_best=output_dir / 'ckpt' / 'tabular.pt',
                         tensorboard=output_dir / 'tb' / 'tabular'),
    )
    TextTrainer(model=model_text_host).train(
        dataset=dataset_text_host_train,
        n_epochs=n_epochs,
        n_steps=n_steps,
        memory=memory,
        valid=Validation(dataset=dataset_text_host_valid, each=valid_each,
                         save_best=output_dir / 'ckpt' / 'text_host.pt',
                         tensorboard=output_dir / 'tb' / 'text_host'),
    )
    TextTrainer(model=model_text_listings).train(
        dataset=dataset_text_listings_train,
        n_epochs=n_epochs,
        n_steps=n_steps,
        memory=memory,
        valid=Validation(dataset=dataset_text_listings_valid, each=valid_each,
                         save_best=output_dir / 'ckpt' / 'text_listings.pt',
                         tensorboard=output_dir / 'tb' / 'text_listings'),
    )
    # Generate synthetic data
    data_synth = model.generate(
        n_samples=data['host'].shape[0],
    )
    # The text models complete the text columns of the generated tabular data
    # (the `data` keyword is an assumption; check the aindo.rdml documentation)
    data_synth = model_text_host.generate(data=data_synth)
    data_synth = model_text_listings.generate(data=data_synth)
    synth_dir = output_dir / 'synth'
    data_synth.to_csv(synth_dir, escapechar='\\')
    # Revert to the original form
    df_synth = postproc_data(data=data_synth).loc[:, df.columns]
    df_synth.to_csv(synth_dir / 'airbnb.csv', index=False, escapechar='\\')

    # Compute and print PDF report
    report(
        data=data_test,  # assumed arguments other than `path`; check the aindo.rdml documentation
        data_synth=data_synth,
        path=output_dir / 'report.pdf',
    )
    # Compute extra privacy stats and print some results
    privacy_stats = compute_privacy_stats(
        data=data_test,  # assumed arguments; check the aindo.rdml documentation
        data_synth=data_synth,
    )
    privacy_stats_out = {
        t: {
            'privacy_score': ps.privacy_score,
            'privacy_score_std': ps.privacy_score_std,
            '%_points_at_risk': ps.risk * 100,
        }
        for t, ps in privacy_stats.items()
    }
    with open(output_dir / 'privacy_stats.json', mode='w', encoding='utf-8') as f:
        json.dump(privacy_stats_out, f)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('data_dir', type=Path, help="The directory containing the 'airbnb' dataset")
    parser.add_argument('output_dir', type=Path, help="The output directory")
    parser.add_argument(
        '--n', '-n', type=int, default=1000,
        help="Training epochs (or steps if the --steps flag is used)",
    )
    parser.add_argument('--steps', '-s', action='store_true', help="Use steps instead of epochs")
    parser.add_argument('--valid-each', '-v', type=int, default=200, help="Number of steps between validations")
    parser.add_argument('--device', '-g', default=None, help="Training device")
    parser.add_argument('--memory', '-m', type=int, default=4096, help="Available memory (MB)")
    parser.add_argument(
        '--quick', '-q', action='store_true',
        help="Perform a quick test run, with reduced data and small models",
    )
    args = parser.parse_args()

    train_and_generate(  # hypothetical function name, matching the pipeline function defined above
        data_dir=args.data_dir,
        output_dir=args.output_dir,
        n_epochs=None if args.steps else args.n,
        n_steps=args.n if args.steps else None,
        valid_each=args.valid_each,
        device=args.device,
        memory=args.memory,
        quick=args.quick,
    )