12345678910111213141516171819 |
# Derive two parquet tables from "acl.parquet":
#   * articles.parquet         - the source rows plus article_id,
#                                date_created and categories columns.
#   * articles_authors.parquet - one row per (article, author key) pair.
import pandas as pd

df = pd.read_parquet("acl.parquet")

# NOTE: `articles` is the same object as `df`; the columns added below are
# therefore visible through both names.
articles = df
# article_id is the frame's index cast to int.
articles["article_id"] = df.index.astype(int)
articles["date_created"] = articles["year"]
# No category information is derived here; the column is filled with None.
articles["categories"] = None

# Take an explicit copy of the column subset: assigning a new column to a
# bare slice of `articles` is chained assignment and triggers pandas'
# SettingWithCopyWarning.
articles_authors = articles[["article_id", "acl_id", "author"]].copy()

# Split each author string on the literal "and\n" separator. Missing author
# strings become "" so that explode() still yields a (later discarded) row.
articles_authors["bai"] = (
    articles_authors["author"].str.split("and\n").fillna("")
)
articles_authors = articles_authors.explode("bai")

# Normalise each name, then keep the text from the start through a ", "
# followed by one lowercase ASCII letter, e.g. "smith, john" -> "smith, j".
# `.*` is greedy, so with several ", <letter>" candidates the last one wins.
# Names that do not match become NaN.
# NOTE(review): "bai" presumably stands for an author identifier - confirm.
articles_authors["bai"] = articles_authors["bai"].str.lower().str.strip()
articles_authors["bai"] = articles_authors["bai"].str.extract(
    r"^((?:.*), [a-z])", expand=False
)

# Drop rows whose key is missing or shorter than two characters.
has_key = articles_authors["bai"].fillna("").str.len() > 1
articles_authors = articles_authors[has_key]
articles_authors = articles_authors[["article_id", "acl_id", "bai"]]

articles_authors.to_parquet("articles_authors.parquet")
articles.to_parquet("articles.parquet")
|