Explorar el Código

kfold validation

Lucas Gautheron hace 1 año
padre
commit
565f914049
Se han modificado 1 ficheros con 41 adiciones y 3 borrados
  1. 41 3
      analyses/category_prediction.py

+ 41 - 3
analyses/category_prediction.py

@@ -64,6 +64,7 @@ if __name__ == '__main__':
     articles = articles[articles["date_created"].str.len() >= 4]
     articles["year"] = articles["date_created"].str[:4].astype(int)-1980
     articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
+    articles["year_group"] = articles["year"]//5
 
     if args.reuse_articles:
         used = pd.read_csv(opj(args.location, 'articles.csv'))
@@ -243,15 +244,13 @@ if __name__ == '__main__':
                 'rank': j
             })
 
-        years = validation["year"].values.astype(int)
         predictions = fit[i].predict(np.stack(validation["bow_tfidf"].values)[:,0:vocab])
         dummy_predictions = dummies[i].predict(np.stack(validation["bow_tfidf"].values)[:,0:vocab])
 
         validation[f"accurate_{i}"] = predictions==y_hat
-        validation[f"dummy_accurate_{i}"] = predictions==y_hat
+        validation[f"dummy_accurate_{i}"] = dummy_predictions==y_hat
         validation[f"truth_{i}"] = y_hat
 
-    validation["year_group"] = validation["year"]//5
     validation.groupby("year_group").agg(
         accurate_0=("accurate_0", "mean"),
         accurate_1=("accurate_1", "mean"),
@@ -267,6 +266,45 @@ if __name__ == '__main__':
         count_2=("truth_2", "count"),
     ).to_csv(opj(args.location, "accuracy_per_period.csv"))
 
+    kfold = []
+
+    
+    for year_group, test in articles.groupby("year_group"):
+        train = articles[articles["year_group"] != year_group]
+        accurate = np.zeros(3)
+        dummy_accurate = np.zeros(3)
+        truth = np.zeros(3)
+        count = np.zeros(3)
+
+        for i in range(3):
+            kfold_fit = LogisticRegression(random_state=0,max_iter=200).fit(np.stack(train["bow_tfidf"].values)[:,0:vocab], np.stack(train["cats"].values).astype(int)[:,i])
+            y_hat = np.stack(test["cats"].values).astype(int)[:,i]
+
+            predictions = kfold_fit.predict(np.stack(test["bow_tfidf"].values)[:,0:vocab])
+            dummy_predictions = dummies[i].predict(np.stack(test["bow_tfidf"].values)[:,0:vocab])
+
+            accurate[i] = (predictions==y_hat).mean()
+            dummy_accurate[i] = (dummy_predictions==y_hat).mean()
+            truth[i] = y_hat.mean()
+            count[i] = len(test)
+
+        kfold.append({
+            "year_group": year_group,
+            "accurate_0": accurate[0],
+            "accurate_1": accurate[1],
+            "accurate_2": accurate[2],
+            "dummy_accurate_0": dummy_accurate[0],
+            "dummy_accurate_1": dummy_accurate[1],
+            "dummy_accurate_2": dummy_accurate[2],
+            "truth_0": truth[0],
+            "truth_1": truth[1],
+            "truth_2": truth[2],
+            "count_0": count[0],
+            "count_1": count[1],
+            "count_2": count[2],
+        })
+
+    pd.DataFrame(kfold).to_csv(opj(args.location, "accuracy_per_period_kfold.csv"))
 
     results = pd.DataFrame(results)
     results["drop"] = False