"""Derive the articles and article/author tables from the ACL parquet dump.

Reads ``acl.parquet`` and writes two files:

* ``articles.parquet`` -- the input rows plus ``article_id``,
  ``date_created`` and ``categories`` columns.
* ``articles_authors.parquet`` -- one row per (article, author key) with the
  columns ``article_id``, ``acl_id`` and ``bai``.
"""
import pandas as pd

SOURCE_PATH = "acl.parquet"
ARTICLES_PATH = "articles.parquet"
ARTICLES_AUTHORS_PATH = "articles_authors.parquet"

# Separator between names in the "author" field: the word "and" followed by a
# line break.
AUTHOR_SEPARATOR = "and\n"

# Captures everything up to and including the first letter that follows the
# last ", " in an already lower-cased name, e.g. "smith, john" -> "smith, j".
AUTHOR_KEY_PATTERN = r"^((?:.*), [a-z])"


def build_articles(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the derived article columns added.

    Adds ``article_id`` (the index cast to int), ``date_created`` (a copy of
    the ``year`` column) and ``categories`` (all missing). The input frame is
    left untouched.
    """
    return df.assign(
        article_id=df.index.astype(int),
        date_created=df["year"],
        categories=None,
    )


def build_articles_authors(articles: pd.DataFrame) -> pd.DataFrame:
    """Return one row per article/author pair, keyed by a normalised name.

    *articles* must provide the ``article_id``, ``acl_id`` and ``author``
    columns. Each ``author`` value is split on ``"and\\n"``; every piece is
    lower-cased, stripped and reduced with ``AUTHOR_KEY_PATTERN``. Pieces that
    do not match the pattern (including missing authors) are dropped.
    """
    # Explicit copy: adding a column to a bare column selection triggers
    # pandas' SettingWithCopyWarning.
    authors = articles[["article_id", "acl_id", "author"]].copy()

    # Missing authors become "" so that explode() keeps the row as a single
    # empty string; the length filter below then removes it.
    authors["bai"] = authors["author"].str.split(AUTHOR_SEPARATOR).fillna("")
    authors = authors.explode("bai")

    normalised = authors["bai"].str.lower().str.strip()
    authors["bai"] = normalised.str.extract(AUTHOR_KEY_PATTERN, expand=False)

    # extract() yields NaN where the pattern does not match; treat those as
    # empty and keep only keys longer than one character.
    has_key = authors["bai"].fillna("").map(len) > 1
    return authors.loc[has_key, ["article_id", "acl_id", "bai"]]


def main() -> None:
    """Run the conversion: read the source file and write both outputs."""
    articles = build_articles(pd.read_parquet(SOURCE_PATH))
    build_articles_authors(articles).to_parquet(ARTICLES_AUTHORS_PATH)
    articles.to_parquet(ARTICLES_PATH)


if __name__ == "__main__":
    main()