GJP_data_initial_processing1.m 4.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  %% Good Judgment Project (GJP) data processing
  % This script reads the GJP forecasting tables, removes the rows and
  % variables that are not needed, computes a Brier score for every remaining
  % forecast, appends it as a column, and saves the resulting tables to a
  % MAT-file for later analysis.
  clc
  clearvars   % reset workspace variables only; unlike `clear all`, keeps breakpoints and loaded functions
  close all
  tic         % start timing the processing stage (reported by the first `toc` further down)
  % Folder holding the original GJP files, relative to the current folder.
  % Paths are assembled with fullfile so the script is not tied to the
  % Windows '\' separator.
  DataPath = 'data';
  %%%% Data for individuals participating in the Good Judgment Project
  % The data files carry a .csv extension but are read as tab-delimited text.
  Individual_data = readtable(fullfile(DataPath, 'all_individual_differences.csv'), 'Delimiter', 'tab'); % large table of per-participant variables
  %%%% Forecasting data for each of the 4 years of the project
  Forecasts_y1 = readtable(fullfile(DataPath, 'survey_fcasts.yr1.csv'), 'Delimiter', 'tab');
  Forecasts_y2 = readtable(fullfile(DataPath, 'survey_fcasts.yr2.csv'), 'Delimiter', 'tab');
  Forecasts_y3 = readtable(fullfile(DataPath, 'survey_fcasts.yr3.csv'), 'Delimiter', 'tab');
  Forecasts_y4 = readtable(fullfile(DataPath, 'survey_fcasts.yr4.csv'), 'Delimiter', 'tab');
  %%%% Questions used in the forecasting tournament (individual forecasting problems, ifps)
  % The third output of xlsread is the raw sheet as a cell array; its first
  % row supplies the variable names and the remaining rows the data.
  [~,~,rawifps] = xlsread(fullfile(DataPath, 'ifps.xlsx'));
  ifps = cell2table(rawifps(2:end,:), 'VariableNames', rawifps(1,:));
  %% Tag every forecast with its project year and stack the yearly tables
  %%%% so that a single table is handled instead of one per year.
  yearly_tables = {Forecasts_y1, Forecasts_y2, Forecasts_y3, Forecasts_y4};
  % The per-year originals (and the raw question sheet) are no longer needed.
  clear Forecasts_y1 Forecasts_y2 Forecasts_y3 Forecasts_y4 rawifps
  for yr = 1:numel(yearly_tables)
      % Explicit column of the right height: does not depend on any other
      % variable of the table and also works for a table with zero rows.
      yearly_tables{yr}.year = repmat(yr, height(yearly_tables{yr}), 1);
  end
  Forecasts_all = vertcat(yearly_tables{:});
  clear yearly_tables yr
  %% Drop rows and columns of the original data that are not needed (saves memory)
  % Rows whose question status is 'voided' are discarded first.
  is_voided = strcmp(Forecasts_all.q_status, 'voided');
  Forecasts_all = Forecasts_all(~is_voided, :);
  clear is_voided
  % Columns removed:
  %   q_status    - no longer informative once the voided rows are gone
  %   forecast_id - per-entry identifier, not used further in this script
  %   fcast_date  - per the original author, redundant with the timestamp
  Forecasts_all(:, {'q_status', 'forecast_id', 'fcast_date'}) = [];
  %% Keep binary questions only
  %%% Questions with exactly two answer options stay in the questions table.
  is_binary_ifp = (ifps.n_opts == 2);
  ifps = ifps(is_binary_ifp, :);
  %%% Forecasts whose question is no longer in the questions table are dropped as well.
  refers_to_kept_ifp = ismember(Forecasts_all.ifp_id, ifps.ifp_id);
  Forecasts_all = Forecasts_all(refers_to_kept_ifp, :);
  clear is_binary_ifp refers_to_kept_ifp
  %% Drop the complementary forecast row of every entry
  % The forecast tables hold one row per possible outcome of a question.
  % For binary questions the second row is redundant, so the rows of one
  % answer option are deleted to simplify the later analysis.
  ans_opt_2remove = 'b'; % answer option whose rows are removed
  %%% Keep only the rows that do not belong to the answer option chosen above.
  is_redundant_row = strcmp(Forecasts_all.answer_option, ans_opt_2remove);
  Forecasts_all = Forecasts_all(~is_redundant_row, :);
  clear is_redundant_row
  %% Brier score for each of the remaining forecasts
  % Only the rows that survived the removal of the 'b' answer option are
  % scored, so a resolved outcome of 'a' is coded as 1 and 'b' as 0.
  % Forecasts whose question cannot be found, or whose outcome is neither
  % 'a' nor 'b', keep a NaN score instead of silently reusing the outcome
  % of the previously processed row.
  n_forecasts = height(Forecasts_all);
  Briers = nan(n_forecasts, 1);

  % Numeric outcome per question, computed once for the whole question table.
  outcome_labels = ifps.outcome;
  if ischar(outcome_labels)
      outcome_labels = cellstr(outcome_labels); % one label per row, whatever the storage
  end
  outcome_per_ifp = nan(height(ifps), 1);
  outcome_per_ifp(strcmp(outcome_labels, 'a')) = 1;
  outcome_per_ifp(strcmp(outcome_labels, 'b')) = 0;

  % Row of the question table that each forecast refers to (0 when absent);
  % a single lookup replaces a full strcmp scan per forecast.
  [has_ifp, ifp_row] = ismember(Forecasts_all.ifp_id, ifps.ifp_id);

  % Pull the column out of the table once; dot-indexing inside the loop is slow.
  forecast_values = Forecasts_all.value;

  for i = 1:n_forecasts
      if ~has_ifp(i)
          continue   % question not present in the question table
      end
      outcome4BrierFunction = outcome_per_ifp(ifp_row(i));
      if isnan(outcome4BrierFunction)
          continue   % outcome is neither 'a' nor 'b'
      end
      % BrierScoreCalc is defined outside this script; it is called with one
      % forecast and one outcome at a time, exactly as before.
      Briers(i) = BrierScoreCalc(forecast_values(i), outcome4BrierFunction);
  end
  %%%% Append the calculated Brier scores to the data table.
  Forecasts_all.Brier = Briers;
  toc
  %% Save the 4-year forecast table, the question table and the participant table
  tic
  %%% as a MATLAB file
  OutputPath = 'D:\Ferreiro\BahramiLab\GoodJudgement\Processed_data\';
  % Create the output folder when it is missing, so that the run does not
  % fail at the very last step after all the processing is done.
  if exist(OutputPath, 'dir') ~= 7
      mkdir(OutputPath);
  end
  % -v7.3 is the MAT-file format that supports variables larger than 2 GB.
  save([OutputPath 'processedGJP.mat'],'Forecasts_all','ifps','Individual_data','-v7.3')
  toc