|
@@ -0,0 +1,212 @@
|
|
|
+from load_website_data import load_icl
|
|
|
+import numpy as np
|
|
|
+try:
|
|
|
+ from scipy.io import savemat
|
|
|
+except ImportError:
|
|
|
+ pass
|
|
|
+from crowd_labeling import CLLDA, concurrent_cllda, combine_cllda
|
|
|
+from crowd_labeling.MV import MV
|
|
|
+import cPickle as pkl
|
|
|
+import json
|
|
|
+from os.path import join, isfile, isdir
|
|
|
+import argparse
|
|
|
+from copy import deepcopy
|
|
|
+import sys
|
|
|
+
|
|
|
+# parse input arguments
|
|
|
+print('parsing arguments')
|
|
|
+parser = argparse.ArgumentParser(description='Run or update CL estimates.')
|
|
|
+parser.add_argument('database', type=str, help='Absolute reference to the sqlite database file.')
|
|
|
+parser.add_argument('save', type=str, help='Directory in which to save results.')
|
|
|
+parser.add_argument('-classifications', type=str, help='Directory in which to save results for website viewing.',
|
|
|
+ default=None)
|
|
|
+args = parser.parse_args()
|
|
|
+database = args.database
|
|
|
+path = args.save
|
|
|
+classifications_path = args.classifications
|
|
|
+assert isfile(database), 'database path does not exist'
|
|
|
+assert isdir(path), 'save path does not exist'
|
|
|
+assert isdir(classifications_path), 'classifications path does not exist'
|
|
|
+
|
|
|
+# load sqlite data
|
|
|
+print('loading database')
|
|
|
+icl_votes = load_icl(database)
|
|
|
+votes = icl_votes['votes']
|
|
|
+workers = icl_votes['workers']
|
|
|
+instances = icl_votes['instances']
|
|
|
+instance_set_numbers = icl_votes['instance_set_numbers']
|
|
|
+instance_ic_numbers = icl_votes['instance_ic_numbers']
|
|
|
+vote_ids = np.array(['Brain', 'Muscle', 'Eye', 'Heart', 'Line Noise', 'Chan Noise', 'Other', '?'])
|
|
|
+instance_ids = icl_votes['instance_ids']
|
|
|
+worker_ids = icl_votes['worker_ids']
|
|
|
+T = icl_votes['n_classes']
|
|
|
+C = icl_votes['n_responses']
|
|
|
+A = icl_votes['n_workers']
|
|
|
+is_expert = (icl_votes['is_expert']).astype(bool) # type = np.ndarray
|
|
|
+
|
|
|
+
|
|
|
+# append identifier to string
|
|
|
+def add_identifier(string, identifier):
|
|
|
+ return '_'.join((x for x in (string, identifier) if x is not None))
|
|
|
+
|
|
|
+
|
|
|
+# run cllda
|
|
|
+def run_cllda(save_path, vts, wks, its, vt_ids=None, it_ids=None, wk_ids=None, it_priors=None, wk_priors=None,
|
|
|
+ it_set_numbers=None, it_ic_numbers=None, identifier=None):
|
|
|
+
|
|
|
+ if isfile(join(save_path, add_identifier('icl_cllda_models', identifier) + '.pkl')):
|
|
|
+ # load for python
|
|
|
+ with open(join(save_path, add_identifier('icl_cllda_models', identifier) + '.pkl'), 'rb') as f:
|
|
|
+ cls = pkl.load(f)
|
|
|
+
|
|
|
+ # update CLLDA with all transforms
|
|
|
+ cls = concurrent_cllda(cls, vts, wks, its, nprocs=4, vote_ids=vt_ids, instance_ids=it_ids,
|
|
|
+ worker_ids=wk_ids, worker_prior=wk_priors, num_epochs=800, burn_in=0)
|
|
|
+
|
|
|
+ else:
|
|
|
+ # CLLDA with all transforms
|
|
|
+ cls = concurrent_cllda(4, vts, wks, its, nprocs=4, vote_ids=vt_ids, instance_ids=it_ids,
|
|
|
+ worker_ids=wk_ids, worker_prior=wk_priors, instance_prior=it_priors,
|
|
|
+ transform=('none', 'ilr', 'clr', 'alr'), num_epochs=1000, burn_in=200)
|
|
|
+
|
|
|
+ # save individual models for python
|
|
|
+ with open(join(save_path, add_identifier('icl_cllda_models', identifier) + '.pkl'), 'wb') as f:
|
|
|
+ pkl.dump(cls, f)
|
|
|
+
|
|
|
+ # combine models
|
|
|
+ cl = combine_cllda(cls)
|
|
|
+
|
|
|
+ # aggregate data
|
|
|
+ return {
|
|
|
+ 'instance_ids': cl.instance_ids,
|
|
|
+ 'worker_ids': cl.worker_ids,
|
|
|
+ 'vote_ids': cl.vote_ids,
|
|
|
+ 'instance_set_numbers': it_set_numbers.astype(int),
|
|
|
+ 'instance_ic_numbers': it_ic_numbers.astype(int),
|
|
|
+ 'transform': cl.transform,
|
|
|
+ 'labels': cl.labels,
|
|
|
+ 'labels_cov': cl.labels_cov,
|
|
|
+ 'worker_mats': cl.worker_mats,
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+# save results in 3 different formats
|
|
|
+def save_results(save_path, data, identifier=None):
|
|
|
+ # save combined model for php
|
|
|
+ print('saving for php')
|
|
|
+ json_data = deepcopy(data)
|
|
|
+ for key, val in json_data.iteritems():
|
|
|
+ if isinstance(val, np.ndarray):
|
|
|
+ json_data[key] = val.tolist()
|
|
|
+ elif isinstance(val, list):
|
|
|
+ for it, item in enumerate(val):
|
|
|
+ if isinstance(item, np.ndarray):
|
|
|
+ val[it] = item.tolist()
|
|
|
+ json_data[key] = val
|
|
|
+ with open(join(save_path, add_identifier('ICLabels', identifier) + '.json'), 'wb') as f:
|
|
|
+ json.dump(json_data, f)
|
|
|
+
|
|
|
+ # save combined model for python
|
|
|
+ print('saving for python')
|
|
|
+ with open(join(save_path, add_identifier('ICLabels', identifier) + '.pkl'), 'wb') as f:
|
|
|
+ pkl.dump(data, f)
|
|
|
+
|
|
|
+ # save combined model for matlab
|
|
|
+ if 'savemat' in sys.modules:
|
|
|
+ print('saving for matlab')
|
|
|
+ for key, val in data.iteritems():
|
|
|
+ if not isinstance(val, np.ndarray):
|
|
|
+ try:
|
|
|
+ val = np.array(val)
|
|
|
+ except ValueError:
|
|
|
+ data[key] = np.empty(len(val), dtype=np.object)
|
|
|
+ for it, item in enumerate(val):
|
|
|
+ data[key][it] = item
|
|
|
+ continue
|
|
|
+ if not np.issubdtype(val.dtype, np.number):
|
|
|
+ data[key] = val.astype(np.object)
|
|
|
+ savemat(join(save_path, add_identifier('ICLabels', identifier) + '.mat'), data)
|
|
|
+
|
|
|
+ # optionally save classifications for website viewing
|
|
|
+ if isdir(classifications_path) and all((x in data.keys() for x in ('labels', 'vote_ids',
|
|
|
+ 'instance_set_numbers', 'instance_ic_numbers'))):
|
|
|
+ path_str = join(classifications_path, add_identifier('website', identifier) + '_icl_')
|
|
|
+ with open(path_str + 'index.json', 'w') as f:
|
|
|
+ json.dump(zip(json_data['instance_set_numbers'], json_data['instance_ic_numbers']), f)
|
|
|
+ with open(path_str + 'classifications.json', 'w') as f:
|
|
|
+ try:
|
|
|
+ json.dump(json_data['labels'][np.where(np.array(data['transform']) == 'none')[0][0]], f)
|
|
|
+ except KeyError:
|
|
|
+ json.dump(json_data['labels'], f)
|
|
|
+ with open(path_str + 'classes.json', 'w') as f:
|
|
|
+ json.dump(json_data['vote_ids'][:-1], f)
|
|
|
+
|
|
|
+
|
|
|
+# CLLDA settings
|
|
|
+n_pseudovotes_e = 100
|
|
|
+n_pseudovotes_u = 1
|
|
|
+expert_prior = n_pseudovotes_e * (np.hstack((np.eye(T), np.zeros((T, 1))))) + 0.01
|
|
|
+user_prior = n_pseudovotes_u * (np.hstack((np.eye(T), np.zeros((T, 1))))) + 0.01
|
|
|
+all_priors = np.zeros((A, T, C))
|
|
|
+all_priors[is_expert.astype(np.bool), :, :] = np.tile(expert_prior[np.newaxis], [is_expert.sum(), 1, 1])
|
|
|
+all_priors[np.logical_not(is_expert), :, :] = np.tile(user_prior[np.newaxis], [np.logical_not(is_expert).sum(), 1, 1])
|
|
|
+instance_prior = np.histogram(votes, range(C))[0] / 100. / np.histogram(votes, range(C))[0].sum()
|
|
|
+
|
|
|
+# run and save CLLDA with experts
|
|
|
+tag = 'expert'
|
|
|
+print('Running CLLDA_' + tag + '...')
|
|
|
+out = run_cllda(path, votes, workers, instances, vote_ids, instance_ids, worker_ids, instance_prior,
|
|
|
+ all_priors, instance_set_numbers, instance_ic_numbers, tag)
|
|
|
+print('Saved individual CLLDA_' + tag + ' models')
|
|
|
+print('Saving combined results...')
|
|
|
+save_results(path, out, tag)
|
|
|
+print('Saved combined results')
|
|
|
+
|
|
|
+
|
|
|
+# run CLLDA without experts
|
|
|
+tag = 'noexpert'
|
|
|
+print('Running CLLDA_' + tag + '...')
|
|
|
+out = run_cllda(path, votes, workers, instances, vote_ids, instance_ids, worker_ids, instance_prior,
|
|
|
+ user_prior, instance_set_numbers, instance_ic_numbers, tag)
|
|
|
+print('Saved individual CLLDA_' + tag + ' models')
|
|
|
+print('Saving combined results...')
|
|
|
+save_results(path, out, tag)
|
|
|
+print('Saved combined results')
|
|
|
+
|
|
|
+
|
|
|
+# run and save with only luca
|
|
|
+
|
|
|
+# remove non-luca votes
|
|
|
+worker_ids_lu = worker_ids[0]
|
|
|
+luca_ind = np.in1d(workers, (0,))
|
|
|
+votes_lu = votes[luca_ind]
|
|
|
+workers_lu = workers[luca_ind]
|
|
|
+instances_lu = instances[luca_ind]
|
|
|
+
|
|
|
+# remove instances with votes that are unsure
|
|
|
+keep_index = np.logical_not(np.in1d(instances_lu, np.unique(instances_lu[votes_lu == 7])))
|
|
|
+votes_lu = votes_lu[keep_index]
|
|
|
+workers_lu = workers_lu[keep_index]
|
|
|
+instances_lu = instances_lu[keep_index]
|
|
|
+instance_ids_lu = instance_ids[np.unique(instances_lu)]
|
|
|
+
|
|
|
+# reset instance numbering
|
|
|
+instance_set_numbers_lu = np.array(instance_set_numbers)[np.unique(instances_lu)]
|
|
|
+instance_ic_numbers_lu = np.array(instance_ic_numbers)[np.unique(instances_lu)]
|
|
|
+instances_lu = np.array([{x: y for x, y in zip(np.unique(instances_lu),
|
|
|
+ np.arange(np.unique(instances_lu).size))}[z]
|
|
|
+ for z in instances_lu])
|
|
|
+
|
|
|
+# run MV
|
|
|
+cl = MV(votes_lu, workers_lu, instances_lu)
|
|
|
+
|
|
|
+# save results
|
|
|
+save_results(path, {
|
|
|
+ 'instance_ids': instance_ids_lu,
|
|
|
+ 'worker_ids': worker_ids_lu,
|
|
|
+ 'vote_ids': vote_ids,
|
|
|
+ 'instance_set_numbers': instance_set_numbers_lu,
|
|
|
+ 'instance_ic_numbers': instance_ic_numbers_lu,
|
|
|
+ 'labels': cl.labels,
|
|
|
+}, 'onlyluca')
|
|
|
+
|