123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230 |
- import nest_asyncio
- nest_asyncio.apply()
- del nest_asyncio
- import stan
- import pandas as pd
- import numpy as np
- import matplotlib
- import matplotlib.pyplot as plt
- from scipy.stats import beta
- import argparse
- matplotlib.use("pgf")
- matplotlib.rcParams.update(
- {
- "pgf.texsystem": "xelatex",
- "font.family": "serif",
- "font.serif": "Times New Roman",
- "text.usetex": True,
- "pgf.rcfonts": False,
- }
- )
- def is_susy(abstract: str):
- abstract = abstract.lower()
- return (
- "supersymmetry" in abstract
- or "supersymmetric" in abstract
- or "susy" in abstract
- )
- def is_hep(categories: str):
- return any(["-HEP" in x for x in categories])
- parser = argparse.ArgumentParser()
- parser.add_argument('action', choices=["compute", "plot"])
- args = parser.parse_args()
- if args.action == "compute":
- articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["article_id", "categories", "date_created"]]
- articles["is_hep"] = articles.categories.map(is_hep)
- articles["is_th"] = articles.categories.map(lambda l: "Theory-HEP" in l)
- articles["is_exp"] = articles.categories.map(lambda l: "Experiment-HEP" in l)
- articles["is_pheno"] = articles.categories.map(lambda l: "Phenomenology-HEP" in l)
- #articles["is_susy"] = articles["abstract"].map(is_susy)
- articles["year"] = articles["date_created"].str[:4].replace('', 0).astype(float).fillna(0).astype(int)
- articles = articles[articles["is_hep"]]
- articles = articles[(articles["year"] >= 1980) & (articles["year"] < 2020)]
- authors = pd.read_parquet("projects/inspire-harvest/database/articles_authors.parquet")[["article_id", "bai"]]
- authors = authors.merge(articles[["article_id", "is_th", "is_exp", "is_pheno"]], how='right')
- authors = authors.groupby("bai").agg(
- th = ('is_th', 'sum'),
- exp = ('is_exp', 'sum'),
- ph = ('is_pheno', 'sum'),
- total = ('article_id', 'count'),
- th_frac = ('is_th', 'mean'),
- exp_frac = ('is_exp', 'mean'),
- ph_frac = ('is_pheno', 'mean')
- )
- authors = authors[authors['total'] >= 3]
- authors = authors.sample(frac=1).head(2500)
- authors.to_parquet('output/social_divide_authors.parquet')
- data = {
- 'n_authors': len(authors),
- 'n_types': 3,
- 'n_articles': authors['total'].astype(int).values
- }
- counts_per_type = np.zeros((len(authors),3))
- for i,t in enumerate(['th', 'exp', 'ph']):
- counts_per_type[:,i] = authors[t].astype(int).values
- data['counts_per_type'] = counts_per_type.astype(int)
- stan_code = """
- data {
- int n_authors;
- int n_types;
- int n_articles[n_authors];
- int counts_per_type[n_authors,n_types];
- }
- parameters {
- matrix <lower=0, upper=1> [n_authors, n_types] probs;
- vector<lower=0>[n_types] alphas;
- vector<lower=0>[n_types] betas;
- // vector<lower=0,upper=1>[n_types] mus;
- // vector<lower=0.1>[n_types] etas;
- }
- // transformed parameters {
- // vector<lower=0>[n_types] alphas;
- // vector<lower=0>[n_types] betas;
- //
- // for (i in 1:n_types) {
- // alphas[i] = mus[i] * etas[i];
- // betas[i] = (1-mus[i]) * etas[i];
- // }
- // }
- model {
- for (i in 1:n_authors) {
- for (j in 1:n_types) {
- probs[i,j] ~ beta(alphas[j], betas[j]);
- counts_per_type[i,j] ~ binomial(n_articles[i], probs[i,j]);
- }
- }
- for (i in 1:n_types) {
- alphas[i] ~ exponential(1);
- betas[i] ~ exponential(1);
- }
- }
- """
- posterior = stan.build(stan_code, data = data)
- fit = posterior.sample(num_chains = 4, num_samples = 1000)
- df = fit.to_frame()
- df.to_parquet(f'output/social_divide_samples.parquet')
- else:
- df = pd.read_parquet(f'output/social_divide_samples.parquet')
- data = {'n_authors': 2500, 'n_types': 3}
- n = len(df)
- cats = ['th','exp','ph']
- probs = dict()
- alphas = dict()
- betas = dict()
- for t in np.arange(data['n_types']):
- probs[t] = [df[f'probs.{k}.{t+1}'].values for k in 1+np.arange(data['n_authors'])]
- probs[t] = np.array(probs[t])
- alphas[t] = df[f'alphas.{t+1}'].values
- betas[t] = df[f'betas.{t+1}'].values
-
- bins = np.linspace(0,1,20,True)
- hats = np.zeros((data['n_types'], n, len(bins)-1))
-
- for line in np.arange(n):
- for j in np.arange(data['n_types']):
- hats[j,line,:] = np.histogram(
- np.array([df[f'p_hat.{i+1}.{j+1}'].iloc[line] for i in np.arange(data['n_authors'])]),
- bins=bins
- )[0]
- fig, axes = plt.subplots(nrows=1,ncols=3,sharey=True)
- authors = pd.read_parquet(f'output/social_divide_authors.parquet')
- x = np.linspace(0,1,100,True)
- for i, t in enumerate([1,2,0]):
- ax = axes[i]
- # ax.hist(
- # probs[t].flatten(),
- # bins=np.linspace(0,1,20,True),
- # histtype="step",
- # color='black',
- # density=True,
- # lw=1,
- # label="$p_{ij}$ (modèle)"
- # )
- m = np.mean(hats[t, :, :]/data['n_authors']/(bins[1]-bins[0]), axis=0)
-
- ax.scatter(
- (bins[:-1]+bins[1:])/2,
- m,
- color="black",
- s=1,
- label="$\hat{p}_{ij}$ (modèle)"
- )
- lower = m-np.quantile(hats[t, :, :]/data['n_authors']/(bins[1]-bins[0]), axis=0, q=0.05/2)
- upper = np.quantile(hats[t, :, :]/data['n_authors']/(bins[1]-bins[0]), axis=0, q=1-0.05/2)-m
- ax.errorbar(
- (bins[:-1]+bins[1:])/2,
- m,
- (lower, upper),
- color="black",
- lw=1,
- ls="None"
- )
- ax.hist(
- authors[f"{cats[t]}_frac"],
- bins=np.linspace(0,1,20,True),
- histtype="step",
- color='blue',
- density=True,
- lw=1,
- linestyle='-',
- label="$\hat{p}_{ij}$ (données)"
- )
- ax.set_yscale('log')
- # for i in np.arange(100):
- # ax.plot(x, beta.pdf(x, alphas[t][-i], betas[t][-i]), alpha=0.01, color='blue')
- ax.set_xlim(0,1)
- cat = cats[t]
- ax.set_xlabel(f'$p_{{i,\\mathrm{{{cat}}}}}$')
- plt.legend()
- actors_desc = "physiciens" if args.target == "authors" else "institutions"
- fig.suptitle("Part des publications des "+actors_desc+" appartenant aux catégories\n``Expérience'' ($p_{exp}$), ``Phénoménologie'' ($p_{ph}$), ``Théorie'' ($p_{th}$)")
- plt.savefig(f"plots/social_divide_gof_{args.target}.pgf")
- plt.savefig(f"plots/social_divide_gof_{args.target}.pdf")
|