# export.py

  1. import pandas as pd
  2. df = pd.read_parquet("acl.parquet")
  3. articles = df
  4. articles["article_id"] = df.index.astype(int)
  5. articles["date_created"] = articles["year"]
  6. articles["categories"] = None
  7. articles_authors = articles[["article_id", "acl_id", "author"]]
  8. articles_authors["bai"] = articles_authors["author"].str.split("and\n").fillna("")
  9. articles_authors = articles_authors.explode("bai")
  10. articles_authors["bai"] = articles_authors["bai"].str.lower().str.strip()
  11. articles_authors["bai"] = articles_authors["bai"].str.extract(r"^((?:.*), [a-z])", expand=False)
  12. articles_authors = articles_authors[articles_authors["bai"].fillna("").map(len)>1]
  13. articles_authors = articles_authors[["article_id", "acl_id", "bai"]]
  14. articles_authors.to_parquet("articles_authors.parquet")
  15. articles.to_parquet("articles.parquet")