# category_prediction.py
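# Predict HEP article categories (Experiment-HEP, Phenomenology-HEP, Theory-HEP)
# from INSPIRE abstracts: extract n-grams, build a TF-IDF bag-of-words
# representation, fit one logistic regression per category, and export the
# vocabulary that best discriminates phenomenology from theory and experiment.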

from AbstractSemantics.terms import TermExtractor
import pandas as pd
import numpy as np
from os.path import join as opj
from os.path import exists
import itertools
from functools import partial
from collections import defaultdict
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
import multiprocessing as mp
from matplotlib import pyplot as plt
import argparse
import yaml
import sys
if __name__ == '__main__':
    parser = argparse.ArgumentParser('CT Model')
    parser.add_argument('location', help='model directory')
    parser.add_argument('filter', choices=['categories', 'keywords', 'no-filter'], help='filter type')
    parser.add_argument('--values', nargs='+', default=[], help='filter allowed values')
    parser.add_argument('--samples', type=int, default=50000)
    parser.add_argument('--constant-sampling', type=int, default=0)
    parser.add_argument('--reuse-articles', default=False, action="store_true", help="reuse article selection")
    parser.add_argument('--nouns', default=False, action="store_true", help="include nouns")
    parser.add_argument('--adjectives', default=False, action="store_true", help="include adjectives")
    parser.add_argument('--lemmatize', default=False, action="store_true", help="lemmatize tokens")
    parser.add_argument('--lemmatize-ngrams', default=False, action="store_true", help="lemmatize n-grams")
    parser.add_argument('--remove-latex', default=False, action="store_true", help="remove latex")
    parser.add_argument('--limit-redundancy', default=False, action="store_true", help="limit redundancy")
    parser.add_argument('--add-title', default=False, action="store_true", help="include title")
    parser.add_argument('--top-unithood', type=int, default=20000, help='top unithood filter')
    parser.add_argument('--min-token-length', type=int, default=0, help='minimum token length')
    parser.add_argument('--min-df', type=int, default=0, help='min_df')
    parser.add_argument('--reuse-stored-vocabulary', default=False, action='store_true')
    parser.add_argument('--threads', type=int, default=4)
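    # NOTE: the arguments are hardcoded below for this particular run rather than
    # being read from sys.argv; the parsed configuration is saved to params.yml.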
    args = parser.parse_args(["output/category_prediction", "categories", "--values", "Experiment-HEP", "Phenomenology-HEP", "Theory-HEP", "--samples", "110000", "--nouns", "--lemmatize", "--lemmatize-ngrams", "--remove-latex", "--add-title", "--top-unithood", "1000", "--threads", "16"])

    with open(opj(args.location, "params.yml"), "w+") as fp:
        yaml.dump(args, fp)
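    # Load article metadata (title, abstract, identifier, creation date, categories)
    # from the inspire-harvest database and keep dated abstracts of at least 100 characters
    # published between 1980 and 2020.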
    articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["title", "abstract", "article_id", "date_created", "categories"]]

    if args.add_title:
        articles["abstract"] = articles["abstract"].str.cat(articles["title"])
        articles.drop(columns=["title"], inplace=True)

    if args.remove_latex:
        # Strip inline LaTeX material delimited by dollar signs
        articles['abstract'] = articles['abstract'].apply(lambda s: re.sub(r'\$[^>]+\$', '', s))

    articles = articles[articles["abstract"].map(len) >= 100]
    articles["abstract"] = articles["abstract"].str.lower()

    articles = articles[articles["date_created"].str.len() >= 4]
    articles["year"] = articles["date_created"].str[:4].astype(int) - 1980
    articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
    articles["year_group"] = articles["year"] // 5
    if args.reuse_articles:
        used = pd.read_csv(opj(args.location, 'articles.csv'))
        articles = articles[articles["article_id"].isin(used["article_id"])]
    else:
        articles = articles[~articles["abstract"].isnull()]

        if args.constant_sampling > 0:
            articles = articles.groupby("year").head(args.constant_sampling)

        keep = pd.Series([False] * len(articles), index=articles.index)

        print("Applying filter...")
        if args.filter == 'keywords':
            for value in args.values:
                keep |= articles["abstract"].str.contains(value)
        elif args.filter == 'categories':
            for value in args.values:
                keep |= articles["categories"].apply(lambda l: value in l)

        articles = articles[keep == True]
        articles = articles.sample(frac=1).head(args.samples)
        articles[["article_id"]].to_csv(opj(args.location, 'articles.csv'))

    articles.reset_index(inplace=True)
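    # Extract candidate n-grams from the abstracts using the selected POS patterns
    # (nouns and/or adjectives), optionally lemmatizing tokens and n-grams.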
    print("Extracting n-grams...")

    extractor = TermExtractor(articles["abstract"].tolist(), limit_redundancy=args.limit_redundancy)

    if args.nouns:
        extractor.add_patterns([["NN.*"]])

    if args.adjectives:
        extractor.add_patterns([["^JJ$"]])

    ngrams = extractor.ngrams(threads=args.threads, lemmatize=args.lemmatize, lemmatize_ngrams=args.lemmatize_ngrams)
    ngrams = map(lambda l: [" ".join(n) for n in l], ngrams)
    ngrams = list(ngrams)

    articles["ngrams"] = ngrams
    print("Deriving vocabulary...")

    if not args.reuse_stored_vocabulary:
        ngrams_occurrences = defaultdict(int)

        for ngrams in articles["ngrams"].tolist():
            _ngrams = set(ngrams)
            for ngram in _ngrams:
                ngrams_occurrences[ngram] += 1

        ngrams_occurrences = pd.DataFrame(
            {"ngram": ngrams_occurrences.keys(), "count": ngrams_occurrences.values()}
        )
        ngrams_occurrences["unithood"] = (
            np.log(2 + ngrams_occurrences["ngram"].str.count(" "))
            * ngrams_occurrences["count"]
        )
        ngrams_occurrences["unithood"] /= len(articles)
        ngrams_occurrences.set_index("ngram", inplace=True)

        ngrams_occurrences["len"] = ngrams_occurrences.index.map(len)
        ngrams_occurrences = ngrams_occurrences[ngrams_occurrences["len"] > 1]

        top = ngrams_occurrences.sort_values("unithood", ascending=False).head(
            args.top_unithood
        )
        top.to_csv(opj(args.location, "ngrams.csv"))

    selected_ngrams = pd.read_csv(opj(args.location, 'ngrams.csv'))['ngram'].tolist()
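    # Build a bag-of-words count matrix over the selected vocabulary
    # and apply TF-IDF weighting.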
    vocabulary = {
        n: i
        for i, n in enumerate(selected_ngrams)
    }

    ngrams = articles["ngrams"].tolist()
    ngrams = [[ngram for ngram in _ngrams if ngram in vocabulary] for _ngrams in ngrams]

    bow = [[vocabulary[ngram] for ngram in _ngrams] for _ngrams in ngrams]
    bow = [[_ngrams.count(i) for i in range(len(selected_ngrams))] for _ngrams in bow]
    bow = np.array(bow)

    tfidf = TfidfTransformer()
    bow_tfidf = tfidf.fit_transform(bow).todense().tolist()
    articles["bow_tfidf"] = bow_tfidf
    cat_classifier = MultiLabelBinarizer(sparse_output=False)
    articles["categories"] = articles["categories"].map(lambda l: list(set(l) & {"Experiment-HEP", "Phenomenology-HEP", "Theory-HEP"}))
    cats = cat_classifier.fit_transform(articles["categories"]).tolist()
    articles["cats"] = cats

    training, validation = train_test_split(articles, train_size=100000/110000)

    from sklearn.linear_model import LogisticRegression
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import f1_score
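    # For each candidate vocabulary size, fit one binary logistic regression per category
    # on the top-ranked TF-IDF features and compare it with a most-frequent-class dummy
    # baseline; the F1 scores summed over the three categories are recorded as a
    # function of vocabulary size.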
    dummies = dict()
    fit = dict()
    scores = dict()
    f1 = dict()
    dummies_scores = dict()
    dummies_f1 = dict()

    score_vs_vocab_size = []

    for vocab in [50] + list(np.arange(125, 1000, 125)):
        score = 0
        for i in range(3):
            dummies[i] = DummyClassifier(strategy="most_frequent")
            dummies[i].fit(np.stack(training["bow_tfidf"].values), np.stack(training["cats"].values).astype(int)[:, i])

            fit[i] = LogisticRegression(random_state=0, max_iter=200).fit(np.stack(training["bow_tfidf"].values)[:, 0:vocab], np.stack(training["cats"].values).astype(int)[:, i])

            y_hat = np.stack(validation["cats"].values).astype(int)[:, i]

            scores[i] = fit[i].score(np.stack(validation["bow_tfidf"].values)[:, 0:vocab], y_hat)
            f1[i] = f1_score(y_hat, fit[i].predict(np.stack(validation["bow_tfidf"].values)[:, 0:vocab]))
            score += f1[i]

            dummies_scores[i] = dummies[i].score(np.stack(validation["bow_tfidf"].values), y_hat)
            dummies_f1[i] = f1_score(y_hat, dummies[i].predict(np.stack(validation["bow_tfidf"].values)))

        print(vocab, score)

        score_vs_vocab_size.append({
            'vocab': vocab,
            'f1': score,
            "acc_0": scores[0],
            "acc_1": scores[1],
            "acc_2": scores[2],
            "baseline_acc_0": dummies_scores[0],
            "baseline_acc_1": dummies_scores[1],
            "baseline_acc_2": dummies_scores[2],
            "f1_0": f1[0],
            "f1_1": f1[1],
            "f1_2": f1[2],
        })

    score_vs_vocab_size = pd.DataFrame(score_vs_vocab_size)
    score_vs_vocab_size.to_csv(opj(args.location, "vocab_performance.csv"))
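    # Refit the per-category classifiers with a fixed vocabulary size of 500 terms,
    # record the regression coefficient of every term for every category, and report
    # accuracy per five-year period on the validation set.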
    vocab = 500

    scores = dict()
    f1 = dict()
    dummies_scores = dict()
    dummies_f1 = dict()

    frequency = np.stack(articles["cats"].values).sum(axis=0)

    results = []

    inv_vocabulary = {
        vocabulary[v]: v
        for v in vocabulary
    }

    for i in range(3):
        dummies[i] = DummyClassifier(strategy="most_frequent")
        dummies[i].fit(np.stack(training["bow_tfidf"].values), np.stack(training["cats"].values).astype(int)[:, i])

        fit[i] = LogisticRegression(random_state=0, max_iter=200).fit(np.stack(training["bow_tfidf"].values)[:, 0:vocab], np.stack(training["cats"].values).astype(int)[:, i])

        y_hat = np.stack(validation["cats"].values).astype(int)[:, i]

        scores[i] = fit[i].score(np.stack(validation["bow_tfidf"].values)[:, 0:vocab], y_hat)
        f1[i] = f1_score(y_hat, fit[i].predict(np.stack(validation["bow_tfidf"].values)[:, 0:vocab]))
        dummies_scores[i] = dummies[i].score(np.stack(validation["bow_tfidf"].values), y_hat)
        dummies_f1[i] = f1_score(y_hat, dummies[i].predict(np.stack(validation["bow_tfidf"].values)))

        for j in range(vocab):
            results.append({
                'term': inv_vocabulary[j],
                'category': cat_classifier.inverse_transform(np.array([np.identity(3)[i, :]]))[0][0],
                'coef': fit[i].coef_[0, j],
                'rank': j
            })

        predictions = fit[i].predict(np.stack(validation["bow_tfidf"].values)[:, 0:vocab])
        dummy_predictions = dummies[i].predict(np.stack(validation["bow_tfidf"].values)[:, 0:vocab])

        validation[f"accurate_{i}"] = predictions == y_hat
        validation[f"dummy_accurate_{i}"] = dummy_predictions == y_hat
        validation[f"truth_{i}"] = y_hat

    validation.groupby("year_group").agg(
        accurate_0=("accurate_0", "mean"),
        accurate_1=("accurate_1", "mean"),
        accurate_2=("accurate_2", "mean"),
        dummy_accurate_0=("dummy_accurate_0", "mean"),
        dummy_accurate_1=("dummy_accurate_1", "mean"),
        dummy_accurate_2=("dummy_accurate_2", "mean"),
        truth_0=("truth_0", "sum"),
        truth_1=("truth_1", "sum"),
        truth_2=("truth_2", "sum"),
        count_0=("truth_0", "count"),
        count_1=("truth_1", "count"),
        count_2=("truth_2", "count"),
    ).to_csv(opj(args.location, "accuracy_per_period.csv"))
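    # Leave-one-period-out evaluation: for each five-year group, train on all other
    # periods and measure accuracy on the held-out period.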
    kfold = []
    for year_group, test in articles.groupby("year_group"):
        train = articles[articles["year_group"] != year_group]

        accurate = np.zeros(3)
        dummy_accurate = np.zeros(3)
        truth = np.zeros(3)
        count = np.zeros(3)

        for i in range(3):
            kfold_fit = LogisticRegression(random_state=0, max_iter=200).fit(np.stack(train["bow_tfidf"].values)[:, 0:vocab], np.stack(train["cats"].values).astype(int)[:, i])
            y_hat = np.stack(test["cats"].values).astype(int)[:, i]
            predictions = kfold_fit.predict(np.stack(test["bow_tfidf"].values)[:, 0:vocab])
            dummy_predictions = dummies[i].predict(np.stack(test["bow_tfidf"].values)[:, 0:vocab])

            accurate[i] = (predictions == y_hat).mean()
            dummy_accurate[i] = (dummy_predictions == y_hat).mean()
            truth[i] = y_hat.mean()
            count[i] = len(test)

        kfold.append({
            "year_group": year_group,
            "accurate_0": accurate[0],
            "accurate_1": accurate[1],
            "accurate_2": accurate[2],
            "dummy_accurate_0": dummy_accurate[0],
            "dummy_accurate_1": dummy_accurate[1],
            "dummy_accurate_2": dummy_accurate[2],
            "truth_0": truth[0],
            "truth_1": truth[1],
            "truth_2": truth[2],
            "count_0": count[0],
            "count_1": count[1],
            "count_2": count[2],
        })

    pd.DataFrame(kfold).to_csv(opj(args.location, "accuracy_per_period_kfold.csv"))
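    # Prune redundant terms: compute the normalized pointwise mutual information (NPMI)
    # between pairs of terms over the first 3000 documents (binary occurrences), and when
    # two strongly associated terms (NPMI >= 0.95) are substrings of one another, drop the
    # lower-ranked one.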
    results = pd.DataFrame(results)
    results["drop"] = False

    bow = (bow >= 1).astype(int)
    num = np.outer(bow[:3000, :vocab].sum(axis=0), bow[:3000, :vocab].sum(axis=0)) / (3000 ** 2)
    den = np.tensordot(bow[:3000, :vocab], bow[:3000, :vocab], axes=([0], [0])) / 3000
    npmi = np.log(num) / np.log(den) - 1

    x, y = np.where(npmi - np.identity(vocab) >= 0.95)

    for k, _ in enumerate(x):
        i = x[k]
        j = y[k]

        a = inv_vocabulary[i]
        b = inv_vocabulary[j]

        if not (a in b or b in a):
            continue

        if i > j:
            results.loc[results['rank'] == i, 'drop'] = True
        else:
            results.loc[results['rank'] == j, 'drop'] = True

    results = results[results["drop"] == False]
    # Keep only terms made of letters, hyphens and spaces
    results = results[results["term"].str.match(r"^[a-zA-Z\- ]*$")]

    results = results.pivot(index="term", columns="category", values="coef")
    results["ph_minus_th"] = results["Phenomenology-HEP"] - results["Theory-HEP"]
    results["ph_minus_exp"] = results["Phenomenology-HEP"] - results["Experiment-HEP"]
    results.sort_values("ph_minus_th").to_csv(opj(args.location, "results.csv"))
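    # Export LaTeX tables of the category-specific vocabulary
    # (rendered with the pgf backend and XeLaTeX).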
    import matplotlib
    from matplotlib import pyplot as plt
    matplotlib.use("pgf")
    matplotlib.rcParams.update(
        {
            "pgf.texsystem": "xelatex",
            "font.family": "serif",
            "font.serif": "Times New Roman",
            "text.usetex": True,
            "pgf.rcfonts": False,
        }
    )

    cats = {"exp": "Experiment-HEP", "th": "Theory-HEP"}
    cats_friendly = {"th": "Theory", "exp": "Experiment"}

    table = []
    for cat in cats:
        top = results[results["Phenomenology-HEP"] > 0].sort_values(f"ph_minus_{cat}", ascending=False).head(40).index.values
        bottom = results[results[cats[cat]] > 0].sort_values(f"ph_minus_{cat}", ascending=True).head(40).index.values

        table.append({
            'Reference category': cats_friendly[cat],
            'Relation to phenomenology': "Vocabulary specific to phenomenology",
            'words': ", ".join(top)
        })
        table.append({
            'Reference category': cats_friendly[cat],
            'Relation to phenomenology': "Vocabulary specific to theory",
            'words': ", ".join(bottom)
        })

    table = pd.DataFrame(table)
    table = table.pivot(index="Reference category", columns="Relation to phenomenology", values="words")

    with pd.option_context("display.max_colwidth", None):
        latex = table.to_latex(
            longtable=True,
            multirow=True,
            multicolumn=True,
            bold_rows=True,
            header=True,
            index_names=False,
            column_format='p{3cm}|p{5cm}|p{5cm}',
            caption="Vocabulary specific to each category. The left column lists expressions that discriminate experiment and theory from phenomenology. The right column lists expressions that are the most specific to phenomenology and foreign to experiment and theory.",
            label="table:specific_pheno_vocabulary"
        )

    with open("tables/specific_vocabulary.tex", "w+") as fp:
        fp.write(latex)

    for cat in ["th", "exp"]:
        table = []

        top = results[results["Phenomenology-HEP"] > 0].sort_values(f"ph_minus_{cat}", ascending=False).head(45).index.values
        bottom = results[results[cats[cat]] > 0].sort_values(f"ph_minus_{cat}", ascending=True).head(45).index.values

        table.append({
            'Reference category': cats_friendly[cat],
            'Relation to phenomenology': "Vocabulary specific to phenomenology",
            'words': ", ".join(top)
        })
        table.append({
            'Reference category': cats_friendly[cat],
            'Relation to phenomenology': f"Vocabulary specific to {cats_friendly[cat].lower()}",
            'words': ", ".join(bottom)
        })

        table = pd.DataFrame(table)
        table = table.pivot(index="Reference category", columns="Relation to phenomenology", values="words")

        caption = f"Vocabulary specific to phenomenology (left column) versus {cats_friendly[cat].lower()} (right column)."

        with pd.option_context("display.max_colwidth", None):
            latex = table.to_latex(
                longtable=True,
                multirow=True,
                multicolumn=True,
                bold_rows=True,
                header=True,
                index_names=False,
                index=False,
                column_format='p{7cm}|p{7cm}',
                caption=caption,
                label=f"table:specific_pheno_vocabulary_{cat}_ph",
                columns=["Vocabulary specific to phenomenology", f"Vocabulary specific to {cats_friendly[cat].lower()}"]
            )

        with open(f"tables/specific_vocabulary_{cat}_ph.tex", "w+") as fp:
            fp.write(latex)