Multi table with text - Airbnb dataset
In the following we present an example script that uses the `aindo.rdml` library to generate synthetic data containing both tabular and text columns.
We make use of the Airbnb Open Data dataset, which in its original form consists of a single table. However, after looking at the content of the dataset columns, we find it natural to rearrange the data into two tables:
- A table `host`, with primary key `host_id`.
- A table `listings`, with primary key `id` and foreign key `host_id`, referring to the primary key of `host`.
The columns `host_name` and `calculated_host_listings_count` are attributes of the host: they are constant across all listings belonging to the same host.
The other columns, on the contrary, contain attributes of each particular listing.
In the script, the function `preproc_data` takes care of this rearrangement, while `postproc_data` performs the inverse transformation, joining the two tables back into a single one.
Two columns of the original dataset should be treated as text columns: `host_name` and `name`.
After rearranging the data, `host_name` belongs to the `host` table, while `name` belongs to the `listings` table.
Since the two columns belong to two different tables, we need to build and train two text models,
on top of the tabular one (used to generate the rest of the tabular data).
import argparseimport jsonfrom pathlib import Path
import pandas as pdimport torch
from aindo.rdml.eval import compute_privacy_stats, reportfrom aindo.rdml.relational import Column, ForeignKey, PrimaryKey, RelationalData, Schema, Tablefrom aindo.rdml.synth import ( TabularDataset, TabularModel, TabularPreproc, TabularTrainer, TextDataset, TextModel, TextPreproc, TextTrainer, Validation,)
def preproc_data(df: pd.DataFrame, schema: Schema) -> dict[str, pd.DataFrame]:
    """Rearrange the flat Airbnb table into a parent/child pair of tables.

    Args:
        df: The original single-table Airbnb dataset.
        schema: Schema whose ``tables["host"]`` and ``tables["listings"]``
            entries list (via ``all_columns``) the columns of each table.

    Returns:
        A dictionary with two entries, in this order:
        ``"host"`` - the host columns of ``df`` with duplicated rows removed;
        ``"listings"`` - the listings columns of ``df``, one row per input row.
    """
    host_columns = list(schema.tables["host"].all_columns)
    listings_columns = list(schema.tables["listings"].all_columns)

    # Host attributes repeat on every listing of the same host, so the
    # duplicated rows are dropped to keep a single row per distinct host record.
    hosts = df.loc[:, host_columns].drop_duplicates()
    listings = df.loc[:, listings_columns]

    return {"host": hosts, "listings": listings}
def postproc_data(data: RelationalData) -> pd.DataFrame:
    """Collapse the two-table data back into the single-table Airbnb layout.

    Args:
        data: Container exposing the ``"host"`` and ``"listings"`` tables
            through item access.

    Returns:
        The host table merged with the listings table on ``host_id``
        (pandas' default merge, with the host table on the left).
    """
    hosts = data["host"]
    listings = data["listings"]
    return pd.merge(hosts, listings, on="host_id")
def example_airbnb(
    data_dir: Path,
    output_dir: Path,
    n_epochs: int | None,
    n_steps: int | None,
    valid_each: int,
    device: str | torch.device | None,
    memory: int,
    quick: bool,
) -> None:
    """Run the full Airbnb example: train, generate, export and evaluate.

    The function reads ``airbnb.csv`` from ``data_dir``, rearranges it into the
    ``host`` / ``listings`` tables, trains one tabular model and two text models
    (one per table containing a text column), generates synthetic data, writes
    it under ``output_dir / "synth"``, and finally produces a PDF report and a
    JSON file with privacy statistics in ``output_dir``.

    Args:
        data_dir: Directory containing ``airbnb.csv``.
        output_dir: Directory receiving checkpoints, TensorBoard logs,
            synthetic data, the report and the privacy statistics.
        n_epochs: Forwarded to every trainer as ``n_epochs``.
        n_steps: Forwarded to every trainer as ``n_steps``.
        valid_each: Forwarded to every ``Validation`` as ``each``
            (the validation trigger is ``"step"``).
        device: Device assigned to every model.
        memory: Forwarded to every trainer as ``memory``.
        quick: If true, only the second part of a ``ratio=0.2`` split of the
            data is used and all models are built with size ``"tiny"``.
    """
    # ---- Load data and define schema ----
    df = pd.read_csv(data_dir / "airbnb.csv")
    schema = Schema(
        host=Table(
            host_id=PrimaryKey(),
            host_name=Column.TEXT,
            calculated_host_listings_count=Column.NUMERIC,
        ),
        listings=Table(
            id=PrimaryKey(),
            host_id=ForeignKey(parent="host"),
            name=Column.TEXT,
            neighbourhood_group=Column.CATEGORICAL,
            neighbourhood=Column.CATEGORICAL,
            latitude=Column.NUMERIC,
            longitude=Column.NUMERIC,
            room_type=Column.CATEGORICAL,
            price=Column.INTEGER,
            minimum_nights=Column.INTEGER,
            number_of_reviews=Column.INTEGER,
            last_review=Column.DATETIME,
            reviews_per_month=Column.NUMERIC,
            availability_365=Column.INTEGER,
        ),
    )
    tables = preproc_data(df=df, schema=schema)
    data = RelationalData(data=tables, schema=schema)
    if quick:
        # Quick runs work on a reduced portion of the data.
        _, data = data.split(ratio=0.2)

    # ---- Define preprocessors ----
    preproc = TabularPreproc.from_schema(schema=schema).fit(data=data)
    host_text_preproc = TextPreproc.from_tabular(preproc=preproc, table="host").fit(data=data)
    listings_text_preproc = TextPreproc.from_tabular(preproc=preproc, table="listings").fit(data=data)

    # ---- Split data: test first, then validation out of the remainder ----
    split_ratio = 0.1
    data_train_valid, data_test = data.split(ratio=split_ratio)
    data_train, data_valid = data_train_valid.split(ratio=split_ratio)

    # ---- Build datasets ----
    tab_train = TabularDataset.from_data(data=data_train, preproc=preproc, on_disk=True)
    tab_valid = TabularDataset.from_data(data=data_valid, preproc=preproc, on_disk=True)

    host_text_train = TextDataset.from_data(data=data_train, preproc=host_text_preproc, on_disk=True)
    host_text_valid = TextDataset.from_data(data=data_valid, preproc=host_text_preproc, on_disk=True)

    listings_text_train = TextDataset.from_data(data=data_train, preproc=listings_text_preproc, on_disk=True)
    listings_text_valid = TextDataset.from_data(data=data_valid, preproc=listings_text_preproc, on_disk=True)

    # ---- Build models ----
    tabular_size = "tiny" if quick else "small"
    text_size = "tiny" if quick else "normal"

    model = TabularModel.build(preproc=preproc, size=tabular_size)
    # Device to None means it will be set to CUDA if the latter is available, otherwise CPU
    model.device = device

    host_text_model = TextModel.build(
        preproc=host_text_preproc,
        size=text_size,
        # The block size must accommodate the longest text of both datasets.
        block_size=max(host_text_train.max_text_len, host_text_valid.max_text_len),
    )
    host_text_model.device = device

    listings_text_model = TextModel.build(
        preproc=listings_text_preproc,
        size=text_size,
        block_size=max(listings_text_train.max_text_len, listings_text_valid.max_text_len),
    )
    listings_text_model.device = device

    # ---- Train the models ----
    # Settings shared by the three training runs.
    train_kwargs = {"n_epochs": n_epochs, "n_steps": n_steps, "memory": memory}
    valid_kwargs = {"early_stop": "normal", "each": valid_each, "trigger": "step"}
    ckpt_dir = output_dir / "ckpt"
    tb_dir = output_dir / "tb"

    TabularTrainer(model=model).train(
        dataset=tab_train,
        valid=Validation(
            dataset=tab_valid,
            save_best=ckpt_dir / "tabular.pt",
            tensorboard=tb_dir / "tabular",
            **valid_kwargs,
        ),
        **train_kwargs,
    )

    TextTrainer(model=host_text_model).train(
        dataset=host_text_train,
        valid=Validation(
            dataset=host_text_valid,
            save_best=ckpt_dir / "text_host.pt",
            tensorboard=tb_dir / "text_host",
            **valid_kwargs,
        ),
        **train_kwargs,
    )

    TextTrainer(model=listings_text_model).train(
        dataset=listings_text_train,
        valid=Validation(
            dataset=listings_text_valid,
            save_best=ckpt_dir / "text_listings.pt",
            tensorboard=tb_dir / "text_listings",
            **valid_kwargs,
        ),
        **train_kwargs,
    )

    # ---- Generate synthetic data ----
    # The tabular model generates first; each text model then takes the data
    # generated so far and returns the data passed on to the next step.
    n_hosts = data["host"].shape[0]
    data_synth = model.generate(n_samples=n_hosts, batch_size=1024)
    for text_model in (host_text_model, listings_text_model):
        data_synth = text_model.generate(data=data_synth, batch_size=512)

    synth_dir = output_dir / "synth"
    data_synth.to_csv(synth_dir, escapechar="\\")

    # ---- Revert to the original single-table form ----
    joined = postproc_data(data=data_synth)
    df_synth = joined.loc[:, df.columns]  # restore the original column order
    df_synth.to_csv(synth_dir / "airbnb.csv", index=False, escapechar="\\")

    # ---- Compute and print PDF report ----
    report(
        data_train=data_train,
        data_test=data_test,
        data_synth=data_synth,
        path=output_dir / "report.pdf",
    )

    # ---- Compute extra privacy stats and store a per-table summary ----
    privacy_stats = compute_privacy_stats(data_train=data_train, data_synth=data_synth)
    privacy_stats_out = {}
    for table_name, stats in privacy_stats.items():
        privacy_stats_out[table_name] = {
            "privacy_score": stats.privacy_score,
            "privacy_score_std": stats.privacy_score_std,
            "%_points_at_risk": stats.risk * 100,
        }
    with (output_dir / "privacy_stats.json").open(mode="w", encoding="utf-8") as out_file:
        json.dump(privacy_stats_out, out_file)
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser of the Airbnb example script.

    Returns:
        A parser with two positional arguments (``data_dir``, ``output_dir``)
        and the options ``--n``, ``--steps``, ``--valid-each``, ``--device``,
        ``--memory`` and ``--quick``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("data_dir", type=Path, help="The directory where to find the 'airbnb' dataset")
    parser.add_argument("output_dir", type=Path, help="The output directory")
    parser.add_argument(
        "--n",
        "-n",
        type=int,
        default=1000,
        help="Training epochs (or steps if the --steps flag is used)",
    )
    parser.add_argument("--steps", "-s", action="store_true", help="Use steps instead of epochs")
    parser.add_argument("--valid-each", "-v", type=int, default=200, help="# steps between validations")
    parser.add_argument("--device", "-g", default=None, help="Training device")
    parser.add_argument("--memory", "-m", type=int, default=4096, help="Available memory (MB)")
    parser.add_argument(
        "--quick",
        "-q",
        action="store_true",
        help="Perform a quick test run, with reduced data and small models",
    )
    return parser


if __name__ == "__main__":
    args = build_parser().parse_args()

    example_airbnb(
        data_dir=args.data_dir,
        output_dir=args.output_dir,
        # --n counts epochs by default, or steps when --steps is given:
        # exactly one of n_epochs / n_steps is set, the other is None.
        n_epochs=None if args.steps else args.n,
        n_steps=args.n if args.steps else None,
        valid_each=args.valid_each,
        device=args.device,
        memory=args.memory,
        quick=args.quick,
    )