1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798 |
- %% Good Judgement Project data processing
- % Reads the GJP forecasting tables, removes the rows and variables that are
- % not needed, keeps only binary questions, calculates a Brier score for
- % every remaining forecast and appends it as a column. The resulting
- % tables are then saved to a MATLAB data file for later analysis.
- clc
- clearvars % clears workspace variables only; 'clear all' would also flush cached functions and breakpoints
- close all
- tic % timer read by the toc that follows the Brier score computation
-
- %% Loading the original data
- % Folder holding the original data (the tab separated version was used).
- % Built with filesep so the script also runs on macOS/Linux; on Windows
- % this is still 'data\'.
- DataPath = ['data' filesep];
- %%%% Data for individuals participating in the Good Judgment Project
- % Very large table with the per-participant variables.
- Individual_data = readtable([DataPath 'all_individual_differences.csv'],'Delimiter','tab');
- %%%% Forecasting data, one file for each of the 4 years of the project
- Forecasts_y1 = readtable([DataPath 'survey_fcasts.yr1.csv'],'Delimiter','tab');
- Forecasts_y2 = readtable([DataPath 'survey_fcasts.yr2.csv'],'Delimiter','tab');
- Forecasts_y3 = readtable([DataPath 'survey_fcasts.yr3.csv'],'Delimiter','tab');
- Forecasts_y4 = readtable([DataPath 'survey_fcasts.yr4.csv'],'Delimiter','tab');
- %%%% Questions used in the forecasting tournament (individual forecasting problems, ifps)
- % The spreadsheet is read raw; its first row supplies the variable names
- % and the remaining rows supply the table contents.
- [~,~,rawifps] = xlsread([DataPath 'ifps.xlsx']);
- ifps = cell2table(rawifps(2:end,:), 'VariableNames', rawifps(1,:));
- %% Tag each yearly forecast table with its year, then stack them
- %%%% One combined table is easier to handle than one table per year.
- Forecasts_y1.year = ones(height(Forecasts_y1),1);
- Forecasts_y2.year = 2*ones(height(Forecasts_y2),1);
- Forecasts_y3.year = 3*ones(height(Forecasts_y3),1);
- Forecasts_y4.year = 4*ones(height(Forecasts_y4),1);
- Forecasts_all = [Forecasts_y1; Forecasts_y2; Forecasts_y3; Forecasts_y4];
- % The yearly tables and the raw spreadsheet cells are no longer needed.
- clear Forecasts_y1 Forecasts_y2 Forecasts_y3 Forecasts_y4 rawifps
- %% Dropping rows and columns that are not needed (to save memory)
- % Forecasts on questions that the data collectors voided are discarded.
- isVoided = strcmp('voided',Forecasts_all.q_status);
- Forecasts_all(isVoided,:) = [];
- clear isVoided
- % Columns removed:
- %   q_status    - no longer informative once the voided rows are gone
- %   forecast_id - per-entry identifier, not used afterwards
- %   fcast_date  - redundant, the date is also part of the timestamp
- Forecasts_all(:,{'q_status','forecast_id','fcast_date'}) = [];
- %% Keeping only binary questions
- %%% Only questions with exactly two answer options are analyzed, so every
- %%% other question is dropped from the questions table.
- isBinaryIfp = ifps.n_opts == 2;
- ifps = ifps(isBinaryIfp,:);
- %%% Forecasts whose question is no longer listed in the questions table
- %%% are dropped as well.
- hasBinaryIfp = ismember(Forecasts_all.ifp_id,ifps.ifp_id);
- Forecasts_all = Forecasts_all(hasBinaryIfp,:);
- clear isBinaryIfp hasBinaryIfp
- %% Removing complementary forecasts for the same entry
- % The original forecast tables list the probability assigned to every
- % possible outcome of a question. For binary questions the second line is
- % redundant, so one of the two lines of each forecast is deleted to keep
- % the later analysis simple.
- ans_opt_2remove = 'b'; % answer option whose lines are discarded
- %%% keep only the lines that belong to the other answer option
- Forecasts_all = Forecasts_all(~strcmp(ans_opt_2remove,Forecasts_all.answer_option),:);
- %% Calculating a Brier score for each of the remaining forecasts
- % The outcome handed to BrierScoreCalc is coded 1 when the question
- % resolved as 'a' and 0 when it resolved as 'b'. Forecasts whose question
- % has any other (or a missing) outcome keep a NaN score; they are never
- % scored with the outcome left over from a previous row.
- %%%% Locate the question of every forecast once, instead of string-matching
- %%%% against the whole questions table on every loop iteration.
- [hasIfp,ifpRow] = ismember(Forecasts_all.ifp_id,ifps.ifp_id);
- ifpOutcome = ifps.outcome;
- if ~iscell(ifpOutcome); ifpOutcome = cellstr(ifpOutcome); end
- %%%% Numeric outcome code per question (NaN = neither 'a' nor 'b').
- % strcmp returns false for cells that do not hold text (e.g. empty cells).
- ifpOutcomeCode = nan(numel(ifpOutcome),1);
- ifpOutcomeCode(strcmp(ifpOutcome,'a')) = 1;
- ifpOutcomeCode(strcmp(ifpOutcome,'b')) = 0;
- %%%% Numeric outcome code per forecast.
- nForecasts = size(Forecasts_all,1);
- outcome4BrierFunction = nan(nForecasts,1);
- outcome4BrierFunction(hasIfp) = ifpOutcomeCode(ifpRow(hasIfp));
- nUnscored = sum(isnan(outcome4BrierFunction));
- if nUnscored > 0
-     warning('GJP:unscoredForecasts', ...
-         '%d forecasts have no ''a''/''b'' outcome; their Brier score is left as NaN.',nUnscored);
- end
- %%%% BrierScoreCalc is called one forecast at a time, as before.
- Briers = nan(nForecasts,1);
- for i = 1:nForecasts
-     if isnan(outcome4BrierFunction(i)); continue; end
-     Briers(i) = BrierScoreCalc(Forecasts_all.value(i),outcome4BrierFunction(i));
- end
- %%%% appending the calculated Brier scores to the data table.
- Forecasts_all.Brier = Briers;
- toc
- %% Saving the combined 4-year forecast table, the questions table and
- % the participants table.
- tic
- OutputPath = 'D:\Ferreiro\BahramiLab\GoodJudgement\Processed_data\';
- OutputFile = 'processedGJP.mat';
- % Create the output folder when it is missing, so that the save does not
- % fail after all the processing has already been done.
- if exist(OutputPath,'dir') ~= 7
-     mkdir(OutputPath);
- end
- %%% as a MATLAB file (-v7.3 is the MAT-file format that supports variables larger than 2 GB)
- save([OutputPath OutputFile],'Forecasts_all','ifps','Individual_data','-v7.3')
- toc
|