# ---- Setup -----------------------------------------------------------------
# Clean the workspace. NOTE: this removes every object from the global
# environment of the session the script is run in.
rm(list = ls())

# The csv file has to be in the same directory as this script, so the working
# directory is moved to the script's folder. The editor context only exists
# inside RStudio; outside of RStudio the current working directory is kept
# instead of aborting with an error.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getSourceEditorContext()$path))
}

# clear the console
cat("\014")

# load libraries (order kept from the original because it decides masking)
library(ggplot2)
library(dplyr)
library(ggpubr)
library(ragg)
library(reshape2)
library(data.table)
library(stringr)
library(cowplot)
library(ggrepel)
library(forcats)
library(yarrr)

# ---- Load the data ---------------------------------------------------------
# Empty strings and "N/A" are read as NA.
# neuro_data <- read.csv("results-survey197421_nurkomplett.csv", row.names = NULL, na.strings = c("", "N/A"), sep = ",")
neuro_data2 <- read.csv(
  "results-survey197421_alledaten.csv",
  row.names = NULL,
  na.strings = c("", "N/A"),
  sep = ","
)

# Keep only respondents who stated their current (primary) position.
neuro_data <- neuro_data2[!is.na(neuro_data2$My.current..primary..position.is.), ]

# Guiding questions: what distinguishes people who share data from the others,
# and, more generally, which problems exist in our data infrastructure?

colnames(neuro_data)[1] <- "Response.ID"

# remove one outlier ==> empty row
# `%in%` never yields NA, so a missing Response.ID can no longer turn a row
# into an all-NA row the way `!= 78` did.
neuro_data <- neuro_data[!(neuro_data$Response.ID %in% 78), ]

# remove whitespaces and commas from the column names
colnames(neuro_data) <- str_replace_all(colnames(neuro_data), " ", ".")
colnames(neuro_data) <- str_replace_all(colnames(neuro_data), ",", ".")

#######################################################
#### Figure 1 #########################################
#### Neuro Disciplines + Current Position #############
#######################################################
data0 <- neuro_data %>%
  dplyr::select(
    Response.ID,
    starts_with("Have.you.shared.data.with"),
    starts_with("I.work.at"),
    starts_with("My.current.."),
    starts_with("Which.neuroscience.discipline.s."),
    starts_with("Please.state.if.your.")
  )

# Column-name patterns of the question groups that are melted to long format,
# and the names of the resulting value columns (same order).
comb_string_vec <- c(
  "I.work.at",
  "My.current..",
  "Which.neuroscience.discipline.s.",
  "Please.state.if.your.",
  "Have.you.shared.data.with"
)
comb_col_names <- c(
  "WorkPlaces",
  "CurrentPosition",
  "NeuroDiscipline",
  "FimilarDataTypes",
  "DataSharing"
)

# Handle the following melt loop with care.
# The long-format data set produced by this loop can become very large:
# every pass melts one question group, multiplying the number of rows.
library(data.table)
for (i in seq_along(comb_string_vec)) {
  data0 <- data.table::melt(
    as.data.table(data0),
    id.vars = which(!grepl(comb_string_vec[i], colnames(data0))),
    measure.vars = list(grep(comb_string_vec[i], colnames(data0))),
    variable.name = paste0(comb_col_names[i], "Cat"),
    value.name = comb_col_names[i],
    value.factor = TRUE
  )

  # make some nicer labels: the category column is the second to last one
  data0 <- as.data.frame(data0)
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])

  # Keep everything from the end of the first "..." onwards, turn dots into
  # blanks and restore the "e.g." abbreviation. The regular expressions are
  # kept verbatim because later filters match the resulting labels exactly.
  dots_end <- str_locate(level_strings, "\\.\\.\\.")[, "end"]
  level_strings <- substr(level_strings, dots_end, nchar(level_strings))
  level_strings <- gsub("\\.|\\.\\.", " ", level_strings)
  level_strings <- gsub("e g", "e.g.", level_strings)

  # reset the labels
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

################################
# make a Current Position plot
# One row per respondent and position; positions with fewer than 3 answers
# are dropped.
temp <- data %>%
  select(Response.ID, CurrentPosition) %>%
  na.omit() %>%
  unique() %>%
  group_by(CurrentPosition) %>%
  filter(n() >= 3)

# Absolute and relative frequencies per position.
# NOTE(review): `ordering` is a hard-coded vector of length 7, so this step
# only works while exactly seven positions survive the filter above.
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / sum(n) * 100, 0)) %>%
  mutate(ordering = c(1, 7, 5, 6, 3, 2, 4)) %>%
  arrange(ordering) %>%
  mutate(CurrentPosition = str_replace(
    CurrentPosition, "Research data management focused staff", "RDM staff"
  ))

# Wrap long labels: insert a line break after at most 20 characters.
my_label_func <- function(x) {
  gsub("(.{1,20})(\\s|$)", "\\1\n", x)
}

# Label positions (middle of each stacked segment) and label texts.
# Large groups (n > 10) and small groups (n <= 10) are kept apart because
# they are nudged differently in the final plot.
df_label1 <- temp_relFreq %>%
  arrange(desc(CurrentPosition)) %>%
  mutate(
    lab_ypos = cumsum(percent) - 0.5 * percent,
    lab_label = my_label_func(paste0(CurrentPosition, ": ", percent, "%"))
  ) %>%
  filter(n > 10)

df_label2 <- temp_relFreq %>%
  arrange(desc(CurrentPosition)) %>%
  mutate(
    lab_ypos = cumsum(percent) - 0.5 * percent,
    lab_label = my_label_func(paste0(CurrentPosition, ": ", percent, "%"))
  ) %>%
  filter(n <= 10)

mycol <- yarrr::piratepal("xmen", plot.result = TRUE, trans = .1)

# Exploratory version: no 'x' mapping, bars of constant width, polar
# coordinates with theta applied to the y axis.
F1B <- ggplot(data = temp_relFreq, aes(x = factor(1), fill = CurrentPosition)) +
  geom_bar(width = 1) +
  coord_polar(theta = "y") +
  scale_fill_brewer(palette = "Accent")
print(F1B)

# Second exploratory version, built up step by step.
df <- temp_relFreq
p <- ggplot(df, aes(x = 1, y = percent, fill = CurrentPosition)) +
  geom_bar(stat = "identity") +
  # the closing parenthesis of the title was missing in the original
  ggtitle(paste0("Current Position (n = ", sum(df$n), ")")) +
  coord_polar(theta = "y") +
  scale_fill_brewer(palette = "Set3")
print(p)

p <- p + geom_bar(stat = "identity", color = "black")
p <- p + theme(
  axis.ticks = element_blank(), # the axis ticks
  axis.title = element_blank(), # the axis labels
  axis.text = element_blank(),
  panel.grid = element_blank()
)
print(p)

p <- p + xlab("") + ylab("")
print(p)

# Final Figure 1B: pie chart with repelled in-plot labels.
# geom_col() is the geom for pre-computed bar heights; the original used
# geom_histogram(stat = "identity"), which only works by accident.
# NOTE(review): the title prints the sum of the rounded percentages
# ("n = 100%"), not the number of respondents - confirm this is intended.
F1B <- ggplot(data = temp_relFreq) +
  geom_col(
    mapping = aes(x = factor(1), y = percent, fill = CurrentPosition),
    width = 1
  ) +
  coord_polar(theta = "y") +
  xlab("") +
  ylab("") +
  ggtitle(paste0("Current Position (n = ", sum(temp_relFreq$percent), "%)")) +
  theme(
    text = element_text(size = 7),
    axis.ticks = element_blank(),
    axis.text = element_blank(),
    panel.grid = element_blank(),
    legend.position = "none"
  ) +
  geom_text_repel(
    data = df_label1,
    aes(x = factor(1), y = lab_ypos, label = lab_label),
    colour = "black", box.padding = 0.5, point.size = NA, nudge_x = 0,
    size = 5.8 / .pt
  ) +
  geom_text_repel(
    data = df_label2,
    aes(x = factor(1), y = lab_ypos, label = lab_label),
    colour = "black", box.padding = 0.5, point.size = NA, nudge_x = 1,
    size = 5.8 / .pt
  ) +
  scale_fill_brewer(palette = "Set3")
print(F1B)

sprintf("Absolute und relative Häufigkeiten der %s", "Current Position")
print(temp_relFreq, quote = TRUE, row.names = FALSE)

# Plots must be printed explicitly while the device is open: a bare object
# name is not auto-printed when the script is run through source().
ragg::agg_tiff("Fig1B.tiff", width = 8, height = 8, units = "cm", res = 600, scaling = 1)
print(F1B)
dev.off()

################################
# make a Neuro Discipline plot
# Neuro Discipline questions are Yes/No questions ==> only the "Yes" answers
# are needed. The replacements shorten the labels; their order matters.
temp <- data %>%
  select(Response.ID, NeuroDisciplineCat, NeuroDiscipline) %>%
  filter(NeuroDiscipline != "No") %>%
  na.omit() %>%
  unique() %>%
  group_by(NeuroDiscipline) %>%
  filter(n() >= 3) %>%
  mutate(NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "imaging", "Imaging")) %>%
  # applied twice as in the original: str_replace() only replaces the first
  # match per call
  mutate(NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "Theoretical", "Theoret.")) %>%
  mutate(NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "Theoretical", "Theoret.")) %>%
  mutate(NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "behavioral", "Behav.")) %>%
  mutate(NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "neuroscience", "Neurosci.")) %>%
  mutate(NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "Neuroscience", "Neurosci.")) %>%
  mutate(NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "e.g. electrophysiological recording behavior tracking ", "")) %>%
  mutate(NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "e.g. patient involvement clinical trials ", "")) %>%
  mutate(NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "science", "Science")) %>%
  mutate(NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "e.g. modeling simulation ", ""))

# Absolute numbers per discipline; percentages are relative to the number of
# respondents counted in the Current Position table (temp_relFreq).
temp_absNumbers <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / sum(temp_relFreq$n) * 100)) %>%
  arrange(desc(percent))

# Figure 1A: horizontal bar chart of the disciplines.
F1A <- ggplot(data = temp_absNumbers) +
  geom_col(
    mapping = aes(x = reorder(NeuroDisciplineCat, percent), y = percent),
    colour = "darkblue", fill = "darkblue", width = 0.5
  ) +
  coord_flip() +
  xlab("") +
  ylab("") +
  ggtitle(paste0("Neuro Disciplines (n = ", sum(temp_absNumbers$n), ")")) +
  geom_text(
    aes(x = NeuroDisciplineCat, y = percent, label = paste0(" ", percent, "%")),
    colour = "white", hjust = 1.2, size = 8 / .pt
  ) +
  # wrap long axis labels onto several lines
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, " ", " "), width = 20)) +
  theme(text = element_text(size = 8))
print(F1A)

ragg::agg_tiff("Fig1A.tiff", width = 8, height = 8, units = "cm", res = 600, scaling = 1)
print(F1A)
dev.off()

# plot both graphs into one figure
F1 <- plot_grid(
  F1A, F1B,
  nrow = 1, ncol = 2, align = "h", axis = "lr", scale = 1, rel_widths = c(1, 1)
)
print(F1)

ragg::agg_tiff("Fig1.tiff", width = 17.5, height = 8, units = "cm", res = 600, scaling = 1)
print(F1)
dev.off()

####################################################
#### Figure 2
######################################
#### General Data Sharing ##########################
data0 <- neuro_data %>%
  dplyr::select(
    Response.ID,
    I.work.at...I.am.affiliated.with.,
    My.current..primary..position.is.,
    starts_with("Have.you.shared.data.with"),
    starts_with("Do.you.have.existing.data.sets."),
    starts_with("Which.neuroscience.discipline.s."),
    starts_with("Please.state.if.your.")
  )

# Give the single-answer questions short names (old name -> new name).
# A name that is not present is simply skipped, as in the original.
rename_map <- c(
  "Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse." = "Existing_Data",
  "I.work.at...I.am.affiliated.with." = "Work_Place",
  "My.current..primary..position.is." = "CurrentPosition"
)
for (old_name in names(rename_map)) {
  colnames(data0)[colnames(data0) == old_name] <- rename_map[[old_name]]
}

# Question groups that are melted to long format and the names of the
# resulting value columns (same order).
comb_string_vec <- c(
  "Which.neuroscience.discipline.s.",
  "Please.state.if.your.",
  "Have.you.shared.data.with"
)
comb_col_names <- c(
  "NeuroDiscipline",
  "FimilarDataTypes",
  "DataSharing"
)

# Handle this loop with care: the resulting long-format data set can become
# very large.
library(data.table)
for (i in seq_along(comb_string_vec)) {
  data0 <- data.table::melt(
    as.data.table(data0),
    id.vars = which(!grepl(comb_string_vec[i], colnames(data0))),
    measure.vars = list(grep(comb_string_vec[i], colnames(data0))),
    variable.name = paste0(comb_col_names[i], "Cat"),
    value.name = comb_col_names[i],
    value.factor = TRUE
  )

  # make some nicer labels: the category column is the second to last one
  data0 <- as.data.frame(data0)
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])

  # Keep everything from the end of the first "..." onwards, turn dots into
  # blanks and restore the "e.g." abbreviation (patterns kept verbatim).
  dots_end <- str_locate(level_strings, "\\.\\.\\.")[, "end"]
  level_strings <- substr(level_strings, dots_end, nchar(level_strings))
  level_strings <- gsub("\\.|\\.\\.", " ", level_strings)
  level_strings <- gsub("e g", "e.g.", level_strings)

  # reset the labels
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

# Respondents who answered the "shared with external collaborators" item.
temp2 <- neuro_data %>%
  select(Response.ID, Have.you.shared.data.with.....External.collaborators.) %>%
  na.omit()

###
# Data sharing: keep the answers other than "No", one row per respondent and
# sharing category; answer values given fewer than 3 times are dropped.
temp <- data %>%
  select(Response.ID, DataSharingCat, DataSharing) %>%
  filter(DataSharing != "No") %>%
  na.omit() %>%
  unique() %>%
  group_by(DataSharing) %>%
  filter(n() >= 3)

# Denominator for the percentages, hard-coded in the original.
# NOTE(review): presumably the number of respondents who answered the data
# sharing question (possibly nrow(temp2)) - confirm before changing the data.
n_sharing_respondents <- 144

# Absolute numbers and rounded percentages per category and answer.
temp_absNumbers <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / n_sharing_respondents * 100, 0))

# Unrounded shares per category.
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID, -DataSharing)) %>%
  summarise(n = n()) %>%
  mutate(percent = n / n_sharing_respondents)

# Figure 2: horizontal bar chart. geom_col() is the geom for pre-computed bar
# heights; the original used geom_histogram(stat = "identity").
p <- ggplot(data = temp_absNumbers) +
  geom_col(
    mapping = aes(x = reorder(DataSharingCat, percent), y = percent),
    colour = "darkblue", fill = "darkblue", width = 0.5
  ) +
  coord_flip() +
  theme(text = element_text(size = 11)) +
  xlab("") +
  ylab("") +
  ggtitle(paste0("Datasharing")) +
  geom_text(
    aes(x = DataSharingCat, y = percent, label = paste0(percent, "%")),
    colour = "white", hjust = 1.2
  ) +
  # wrap long axis labels onto several lines
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, " ", " "), width = 20))

# The theme text size is independent of the height of the figure; it relates
# to the resolution (600 per cm).
# In the theme, the size is defined in pts. So here 15 means 15 pts.
# In geom_text, the size is defined in mm, so it's 15 mm.
#
# What is the relation between pts and mm or in?
# If we want exactly the same size for the title and the text in the plot,
# how can we define it? It needs some conversion:
#
#   1 pt = 1/72 in
#   1 pt = 0.35 mm
#
# So if we want the text to be the same size as the title, the size in mm
# will be 15 pt * 0.35 mm/pt = 5.25 mm.
#
# In ggplot there is a constant defined to make the conversion,
# .pt = 2.845276 (1 / .pt = 0.35). You can type .pt in the console and it
# will display its value:
#
#   ggplot2::.pt
#   ## [1] 2.845276
#
# So to make the conversion:
#
#   from pt to mm: mm = pt / .pt -> 15 / 2.845276 = 5.27
#   from mm to pt: pt = mm * .pt -> 5.27 * 2.845276 = 15
#
# Let's change the size of the geom_text to be the same as the title by
# using size = 15 / .pt:
#
#   plt <- penguins %>%
#     ggplot(aes(bill_length_mm, bill_depth_mm, color = species)) +
#     geom_point() +
#     geom_text(x = 45, y = 20, label = "Example of font problem", size = 15 / .pt, inherit.aes = FALSE) +
#     labs(title = "Bill length and depth relation by species") +
#     theme(plot.title = element_text(size = 15))
#
# Affinity Designer reports a 2 cm margin ...
print(p)

# Plots must be printed explicitly while the device is open: a bare object
# name is not auto-printed when the script is run through source().
ragg::agg_tiff("Fig2_DataSharing3.tiff", width = 17.5, height = 6, units = "cm", res = 600, scaling = 1)
print(p)
dev.off()

#####################################
#### Reusing data of others
#####################################
# Select the three items of interest and give them short names in one step.
neuro_data_tmp <- neuro_data %>%
  dplyr::select(
    Response.ID,
    Other_can_answer = Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..,
    Existing_Data = Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.,
    Shared_Publicly = Have.you.shared.data.with.....Publicly.
  )

# How many have no data: counts for every combination of the three answers.
# (This table is overwritten by the next statement.)
temp_absNumbers <- neuro_data_tmp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n())

# Distribution of Existing_Data among complete cases that have data sets.
temp_absNumbers <- neuro_data_tmp %>%
  na.omit() %>%
  filter(Existing_Data != "I have no datasets") %>%
  group_by_at(vars(Existing_Data)) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

# NOTE(review): row 3 is addressed by position, so the reported figure
# depends on the sort order of the answer categories - verify which answer
# it corresponds to.
cat("Von den Antwortenden die mindestens einen Datensatz haben ... wieviel Prozent haben diesen verfuegbar gemacht")
cat(temp_absNumbers$n[3], '(', round(temp_absNumbers$percent[3], 3) * 100, '%) haben diese DAten verfuegbar gemacht')
print(temp_absNumbers, quote = TRUE, row.names = FALSE)

# Distribution of Other_can_answer among the same respondents.
temp_absNumbers <- neuro_data_tmp %>%
  na.omit() %>%
  filter(Existing_Data != "I have no datasets") %>%
  group_by_at(vars(Other_can_answer)) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

cat("Von den Antwortenden die mindestens einen Datensatz haben ... ")
cat(temp_absNumbers$n[3], '(', round(temp_absNumbers$percent[3], 3) * 100, '%) , of all respondents that have at least one dataset are of the opinion that other researchers could answer their own research questions by re-using data from their research.')

# Public sharing among those who think their data is valuable to others.
temp_absNumbers <- neuro_data_tmp %>%
  na.omit() %>%
  filter(Other_can_answer == "Yes") %>%
  group_by_at(vars(Shared_Publicly)) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

S1 <- 'However, even for this subgroup, of scientists in possession of data of which they think are valuable to others '
S2 <- '% have never shared any of their data publicly.'
cat(S1, round(temp_absNumbers$percent[1], 3) * 100, S2)

##########################################################
##########################################################
# Research data management skills are essential for preparing, analyzing,
# and publicly sharing data.
# Only 18% think that they have proficiency in research data management and
# only 34% think that they know which research data management methods are
# available.
# Interestingly, 58% of all respondents nevertheless think that they can
# handle their research data according to community standards. This could be
# due to the availability of data research managers who help in data handling.
# However, only 25 (20%) of participants have dedicated personnel with
# research data management or data curation expertise.
# Select the research-data-management (RDM) items and give them short names
# in one step. The column order is the order of this selection.
neuro_data_tmp <- neuro_data %>%
  dplyr::select(
    Response.ID,
    Other_can_answer = Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..,
    Existing_Data = Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.,
    I_have_RDM_personal = Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.,
    I_can_handle_RD_community_standards = What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.,
    I_know_how_to_publish_my_data_reproducible = Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.,
    I_have_proficiency_in_RDM = What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.,
    Iam_highly_knowledgeable_in_RDM = What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.,
    I_know_RDM_available_Methods = What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.,
    Shared_Publicly = Have.you.shared.data.with.....Publicly.
  )

# Text fragments of the results paragraph; the computed numbers are printed
# between them. The original contained a literal "/n" where a line break
# ("\n") was intended.
S1 <- "Research data management skills are essential for preparing, analyzing, and publicly sharing data. \n Only "
S2 <- "think that they have proficiency in research data management and only "
S3 <- "think that they know which research data management methods are available. \n Interestingly, "
S4 <- "of all respondents nevertheless think that they can handle their research data according to community standards. This could be due to the availability of data research managers who help in data handling. However, only "
S5 <- "of participants have dedicated personnel with research data management or data curation expertise."
cat(S1)

# Answer groupings of the five-point agreement scale.
agree <- c("Fully agree", "Rather agree")
disagree <- c("Fully disagree", "Rather disagree")
not_agree <- c("Fully agree", "Rather agree", "Undecided")
not_disagree <- c("Fully disagree", "Rather disagree", "Undecided")

# Collapse "Fully/Rather agree" to "Agree" and "Fully/Rather disagree" to
# "Disagree"; every other value (including NA) is left untouched.
collapse_agreement <- function(answers) {
  answers <- replace(answers, str_detect(answers, " agree"), "Agree")
  replace(answers, str_detect(answers, " disagree"), "Disagree")
}

# NOTE(review): na.omit() runs on the whole table, so a respondent with a
# missing answer in ANY selected column is dropped from each of the counts
# below - confirm that this listwise deletion is intended.

# "I have proficiency in research data management"
df_np <- neuro_data_tmp %>%
  mutate(I_have_proficiency_in_RDM = collapse_agreement(I_have_proficiency_in_RDM)) %>%
  group_by(I_have_proficiency_in_RDM) %>%
  na.omit() %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))
is_agree <- df_np$I_have_proficiency_in_RDM == 'Agree'
cat(df_np$n[is_agree], '(', round(df_np$percent[is_agree], 3) * 100, '%)')
cat(S2)

# "I know well which research data management methods are available"
df_np <- neuro_data_tmp %>%
  mutate(I_know_RDM_available_Methods = collapse_agreement(I_know_RDM_available_Methods)) %>%
  group_by(I_know_RDM_available_Methods) %>%
  na.omit() %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))
is_agree <- df_np$I_know_RDM_available_Methods == 'Agree'
cat(df_np$n[is_agree], '(', round(df_np$percent[is_agree], 3) * 100, '%)')
cat(S3)

# "I can handle my research data according to community standards"
# (the result is printed by the statement that follows this section)
df_np <- neuro_data_tmp %>%
  mutate(I_can_handle_RD_community_standards = collapse_agreement(I_can_handle_RD_community_standards)) %>%
  group_by(I_can_handle_RD_community_standards) %>%
  na.omit() %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))
# Print the community-standards result computed just before this section.
cat(df_np$n[df_np$I_can_handle_RD_community_standards == 'Agree'], '(', round(df_np$percent[df_np$I_can_handle_RD_community_standards == 'Agree'], 3) * 100, '%)')
cat(S4)

# Availability of dedicated RDM personnel.
df_np <- neuro_data_tmp %>%
  group_by_at(vars(I_have_RDM_personal)) %>%
  na.omit() %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))
cat(df_np$n[df_np$I_have_RDM_personal == 'Yes, in my lab'], '(', round(df_np$percent[df_np$I_have_RDM_personal == 'Yes, in my lab'], 3) * 100, '%)')
cat(S5)

###########################################################
###########################################################
# Use of tools and standards
###########################################################
#
# We inquired about the use of existing tools and standards for different
# research data management activities, if this process step was relevant for
# the participants.

# Same RDM items as before (renamed in one step) plus the two question groups
# on tools and standards.
neuro_data_tmp <- neuro_data %>%
  dplyr::select(
    Response.ID,
    Other_can_answer = Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..,
    Existing_Data = Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.,
    I_have_RDM_personal = Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.,
    I_can_handle_RD_community_standards = What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.,
    I_know_how_to_publish_my_data_reproducible = Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.,
    I_have_proficiency_in_RDM = What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.,
    Iam_highly_knowledgeable_in_RDM = What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.,
    I_know_RDM_available_Methods = What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.,
    Shared_Publicly = Have.you.shared.data.with.....Publicly.,
    starts_with("For.which.of.these.tasks."),
    starts_with("To.what.degree.do.you.")
  )

# Question groups that are melted to long format and the names of the
# resulting value columns (same order).
comb_string_vec <- c(
  "For.which.of.these.tasks.",
  "To.what.degree.do.you."
)
comb_col_names <- c(
  "TaskStandardTools",
  "TaskStandardToolsDegree"
)

data0 <- neuro_data_tmp
library(data.table)
for (i in seq_along(comb_string_vec)) {
  data0 <- data.table::melt(
    as.data.table(data0),
    id.vars = which(!grepl(comb_string_vec[i], colnames(data0))),
    measure.vars = list(grep(comb_string_vec[i], colnames(data0))),
    variable.name = paste0(comb_col_names[i], "Cat"),
    value.name = comb_col_names[i],
    value.factor = TRUE
  )

  # make some nicer labels: the category column is the second to last one
  data0 <- as.data.frame(data0)
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])

  # Keep everything from the end of the first "..." onwards, turn dots into
  # blanks and restore the "e.g." abbreviation. The patterns are kept verbatim
  # because later filters match the resulting labels exactly.
  dots_end <- str_locate(level_strings, "\\.\\.\\.")[, "end"]
  level_strings <- substr(level_strings, dots_end, nchar(level_strings))
  level_strings <- gsub("\\.|\\.\\.", " ", level_strings)
  level_strings <- gsub("e g", "e.g.", level_strings)

  # reset the labels
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

# The original text contained the hyphenation artefact "man-agement".
S1 <- 'We inquired about the use of existing tools and standards for different research data management activities, if this process step was relevant for the participants. '
cat(S1)

# Answer groupings of the five-point agreement scale.
agree <- c('Fully agree', 'Rather agree')
disagree <- c('Fully disagree', 'Rather disagree')
not_agree <- c('Fully agree', 'Rather agree', 'Undecided')
not_disagree <- c('Fully disagree', 'Rather disagree', 'Undecided')

# Collapse "Fully/Rather agree" to "Agree" and "Fully/Rather disagree" to
# "Disagree"; every other value (including NA) is left untouched.
collapse_agreement <- function(answers) {
  answers <- replace(answers, str_detect(answers, " agree"), "Agree")
  replace(answers, str_detect(answers, " disagree"), "Disagree")
}

# NOTE(review): the three summaries below repeat the earlier RDM statistics,
# but na.omit() now also sees the tools/standards columns selected above, so
# every respondent with a missing value in any of them is dropped - confirm
# that this repetition and the listwise deletion are intended.

# "I have proficiency in research data management"
df_np <- neuro_data_tmp %>%
  mutate(I_have_proficiency_in_RDM = collapse_agreement(I_have_proficiency_in_RDM)) %>%
  group_by_at(vars(I_have_proficiency_in_RDM)) %>%
  na.omit() %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))
cat(df_np$n[df_np$I_have_proficiency_in_RDM == 'Agree'], '(', round(df_np$percent[df_np$I_have_proficiency_in_RDM == 'Agree'], 3) * 100, '%)')
cat(S2)

# "I know well which research data management methods are available"
df_np <- neuro_data_tmp %>%
  mutate(I_know_RDM_available_Methods = collapse_agreement(I_know_RDM_available_Methods)) %>%
  group_by_at(vars(I_know_RDM_available_Methods)) %>%
  na.omit() %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))
cat(df_np$n[df_np$I_know_RDM_available_Methods == 'Agree'], '(', round(df_np$percent[df_np$I_know_RDM_available_Methods == 'Agree'], 3) * 100, '%)')
cat(S3)

# "I can handle my research data according to community standards"
# (the result is printed by the statement that follows this section)
df_np <- neuro_data_tmp %>%
  mutate(I_can_handle_RD_community_standards = collapse_agreement(I_can_handle_RD_community_standards)) %>%
  group_by_at(vars(I_can_handle_RD_community_standards)) %>%
  na.omit() %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))
cat(df_np$n[df_np$I_can_handle_RD_community_standards == 'Agree'], '(',
    round(df_np$percent[df_np$I_can_handle_RD_community_standards == 'Agree'], 3) * 100, '%)')
cat(S4)

# Share of respondents per answer to the "dedicated RDM personnel" question.
# Only missing answers to this question are dropped; na.omit() on the whole
# wide frame would discard every respondent with an NA in any other column.
df_np <- neuro_data_tmp %>%
  filter(!is.na(I_have_RDM_personal)) %>%
  group_by_at(vars(I_have_RDM_personal)) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))
cat(df_np$n[df_np$I_have_RDM_personal == 'Yes, in my lab'], '(',
    round(df_np$percent[df_np$I_have_RDM_personal == 'Yes, in my lab'], 3) * 100, '%)')
cat(S5)

######################################################################
####################################################
#### Figure 3 ######################################
#### Tools ##########################

# Task Standard Tools plot.
# Standard Task Tools are Yes/No questions; comment / free-text items are
# removed and each respondent counts once per task.
temp <- data %>%
  select(Response.ID, TaskStandardToolsCat, TaskStandardTools) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsCat)) %>%
  na.omit() %>%
  unique() %>%
  droplevels()

# Relative frequency per task and answer.
# NOTE(review): 159 is presumably the total number of respondents - confirm.
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / 159)

# Use of standard tools: percentage of non-'No' answers within each task.
temp_absNumbers <- temp %>%
  group_by_at(vars(TaskStandardToolsCat, TaskStandardTools)) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / sum(n) * 100, 0)) %>%
  filter(TaskStandardTools != 'No') %>%
  filter(TaskStandardToolsCat != ' Simulation ') %>%
  arrange(percent)

# geom_col() is the identity-stat bar geom (replaces
# geom_histogram(stat = 'identity')).
pFD <- ggplot(data = temp_absNumbers) +
  geom_col(mapping = aes(x = reorder(TaskStandardToolsCat, percent), y = percent),
           colour = 'darkblue', fill = 'darkblue', width = 0.5) +
  coord_flip() +
  theme(text = element_text(size = 11)) +
  xlab('') +
  ylab('') +
  ggtitle('Use of Standard Tools for ...') +
  geom_text(aes(x = TaskStandardToolsCat, y = percent, label = paste0(percent, "%")),
            colour = "white", hjust = 1.5) +
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, " ", " "), width = 20))

# print() explicitly: a bare `pFD` is not auto-printed when the script is run
# via source(), which would leave the tiff file empty.
print(pFD)

ragg::agg_tiff("Fig3.tiff", width = 17.5, height = 10, units = "cm", res = 600, scaling = 1)
print(pFD)
dev.off()

######################################################################
######################################################################
####################################################
#### Figure 4 ######################################
#### Tools vs.
#### DataSharing #########################

# Rebuild the working data set: one row per respondent holding the sharing and
# RDM self-assessment questions plus every column of the multi-item question
# groups selected by prefix.
neuro_data_tmp <- dplyr::select(
  neuro_data,
  Response.ID,
  Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..,
  Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.,
  Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.,
  What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.,
  Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.,
  What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.,
  What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.,
  What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.,
  Have.you.shared.data.with.....Publicly.,
  starts_with('For.which.of.these.tasks.'),
  starts_with('To.what.degree.do.you.'),
  starts_with('Think.of.re.using.data.'),
  starts_with('Think.of.data.sharing.')
)

# Give the long survey column names short aliases. Each assignment is a no-op
# when the column is not present.
# Existing_Data
names(neuro_data_tmp)[names(neuro_data_tmp) == 'Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.'] <- "Existing_Data"
# Other_can_answer
names(neuro_data_tmp)[names(neuro_data_tmp) == 'Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..'] <- "Other_can_answer"
# Shared_Publicly
names(neuro_data_tmp)[names(neuro_data_tmp) == 'Have.you.shared.data.with.....Publicly.'] <- "Shared_Publicly"
# I_know_how_to_publish_my_data_reproducible
# Remaining short aliases for the long survey column names
# (long name -> short name). A name that is not present is skipped.
short_names <- c(
  'Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.' = 'I_know_how_to_publish_my_data_reproducible',
  'Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.' = 'I_have_RDM_personal',
  'What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.' = 'I_can_handle_RD_community_standards',
  'What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.' = 'I_have_proficiency_in_RDM',
  'What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.' = 'Iam_highly_knowledgeable_in_RDM',
  'What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.' = 'I_know_RDM_available_Methods'
)
for (long_name in names(short_names)) {
  names(neuro_data_tmp)[names(neuro_data_tmp) == long_name] <- short_names[[long_name]]
}

# Column-name prefixes of the question groups to reshape to long format, and
# the names of the value columns they are melted into.
comb_string_vec <- c('For.which.of.these.tasks.',
                     'To.what.degree.do.you.',
                     'Think.of.re.using.data.',
                     'Think.of.data.sharing.')
comb_col_names <- c('TaskStandardTools',
                    'TaskStandardToolsDegree',
                    'ThinkReusingData',
                    'ThinkSharingData')

library(data.table)
data0 <- neuro_data_tmp
# Caution: every melt multiplies the number of rows, so the long-format data
# set can become very large.
for (i in seq_along(comb_string_vec)) {
  is_measure <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(as.data.table(data0),
                            id.vars = which(!is_measure),
                            measure.vars = list(which(is_measure)),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i],
                            value.factor = TRUE)
  data0 <- as.data.frame(data0)

  # Make nicer labels for the category column (second to last after melt):
  # keep the text from the end of the first '...' onwards and turn dots into
  # spaces. Vectorised, so an empty level set is handled as well.
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])
  label_start <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  level_strings <- substr(level_strings, label_start, nchar(level_strings))
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

# make a Task Standard Tools plot
# Standard Task Tools are Yes/No questions ==> just need the ones who answered with Yes ==> or?
# remove Comment columns
# NOTE(review): the second filter compares the category label with "No";
# presumably the answer column TaskStandardTools was meant - confirm.
temp <- data %>%
  select(Response.ID, Shared_Publicly, TaskStandardToolsCat, TaskStandardTools) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsCat)) %>%
  filter(TaskStandardToolsCat != "No") %>%
  na.omit() %>%
  unique() %>%
  droplevels()

# Answer counts per sharing group (grouped by Shared_Publicly and
# TaskStandardTools).
# NOTE(review): 159 is presumably the total number of respondents - confirm.
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID, Shared_Publicly, -TaskStandardToolsCat)) %>%
  summarise(n = n()) %>%
  mutate(share = n / 159)

no <- temp_relFreq %>%
  filter(Shared_Publicly == "No") %>%
  mutate(procent = n / sum(n) * 100)
yes <- temp_relFreq %>%
  filter(Shared_Publicly == "Yes") %>%
  mutate(procent = n / sum(n) * 100)

# NOTE(review): procent[2] picks the second answer row by position; this is
# only the 'Yes' share if the answers sort as No, Yes - confirm.
cat("Althoug the use of standard tools are in very different areas there is")
cat("the trend that those how generally use more standard tools are more likely to share their data.")
cat("In the group that did not share their data publicly only")
cat(no$procent[2], "% use standard Tools. While in the group who shares data ")
cat(yes$procent[2], "% use standard Tools. ")
cat("A possible explanation could be that scientists who work a lot with standard tools find it easier to overcome the heavily standardized rules of public sharing of data. ")
cat("Formally, of course, it cannot be excluded that the dominant causality is opposite. However, we consider it unlikely that the motivation to share data is the main driver for a general affinity to use standard methods. ")

###
# Use of standard Tools Degree
temp <- data %>%
  select(Response.ID, Shared_Publicly, TaskStandardToolsDegreeCat, TaskStandardToolsDegree) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsDegreeCat)) %>%
  filter(TaskStandardToolsDegreeCat != "No") %>%
  na.omit() %>%
  unique() %>%
  droplevels()

# All degree answers per sharing group and task ...
temp_absNumbers_all <- temp %>%
  group_by_at(vars(Shared_Publicly, TaskStandardToolsDegreeCat, TaskStandardToolsDegree)) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / sum(n) * 100, 0)) %>%
  filter(TaskStandardToolsDegreeCat != ' Simulation ') %>%
  group_by_at(vars(Shared_Publicly))
# ... and the subset answered with 'Mostly'.
temp_absNumbers <- temp_absNumbers_all %>%
  filter(TaskStandardToolsDegree == 'Mostly')

# Percentage of 'Mostly' answers among all degree answers of one sharing
# group. Each table is indexed with its own mask; indexing the full table with
# the mask of the (shorter) 'Mostly' table recycled the mask and summed the
# wrong rows.
mostly_rate <- function(sharing_group) {
  sum(temp_absNumbers$n[temp_absNumbers$Shared_Publicly == sharing_group]) /
    sum(temp_absNumbers_all$n[temp_absNumbers_all$Shared_Publicly == sharing_group]) * 100
}
yes <- mostly_rate("Yes")
no <- mostly_rate("No")

cat(yes, " % answer mostly if they share their data while only ")
cat(no, " % using mostly standard methods for their work if they did not share their data openly")
cat("Respondents who share their data publicly have a ", (yes - no) / no * 100, "% higher rate to using 'mostly' standard tools in their daily work")

####################################################
#### Obstacles for Data sharing DataSharing #########################

data0 <- neuro_data %>%
  dplyr::select(Response.ID,
                starts_with('Have.you.shared.data.with'),
                starts_with('Please.indicate.'),
                starts_with('How.do.you.process.and.analyze.your.data.'))

comb_string_vec <- c('Please.indicate.',
                     'How.do.you.process.and.analyze.your.data.',
                     'Have.you.shared.data.with')
comb_col_names <- c('SharingProblems',
                    'HowAnalyzeData',
                    'DataSharing')

library(data.table)
for (i in seq_along(comb_string_vec)) {
  is_measure <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(as.data.table(data0),
                            id.vars = which(!is_measure),
                            measure.vars = list(which(is_measure)),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i],
                            value.factor = TRUE)
  data0 <- as.data.frame(data0)

  # Make nicer labels for the category column (second to last after melt):
  # keep the text from the end of the first '...' onwards and turn dots into
  # spaces. Vectorised, so an empty level set is handled as well.
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])
  label_start <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  level_strings <- substr(level_strings, label_start, nchar(level_strings))
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

S1 <- 'We inquired about the use of existing tools and standards for different research data man-agement activities, if this process step was relevant for the participants. '
cat(S1)

agree <- c('Fully agree', 'Rather agree')
disagree <- c('Fully disagree', 'Rather disagree')
not_agree <- c('Fully agree', 'Rather agree', 'Undecided')
not_disagree <- c('Fully disagree', 'Rather disagree', 'Undecided')

# Percentage of Agree / Disagree / other answers per statement.
# The melted value column is a factor (value.factor = TRUE); it is converted to
# character first, because assigning the new value "Agree" into a factor
# yields NA with an "invalid factor level" warning.
temp <- data %>%
  select(Response.ID, SharingProblemsCat, SharingProblems) %>%
  mutate(SharingProblems = as.character(SharingProblems)) %>%
  mutate(SharingProblems = replace(SharingProblems, str_detect(SharingProblems, " agree"), "Agree")) %>%
  mutate(SharingProblems = replace(SharingProblems, str_detect(SharingProblems, " disagree"), "Disagree")) %>%
  na.omit() %>%
  group_by_at(vars(SharingProblemsCat, SharingProblems)) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / sum(n) * 100, 0))

# Rows of `temp` that belong to one statement.
# NOTE(review): the labels are matched exactly, including whitespace; verify
# them against levels(data$SharingProblemsCat).
pick_statement <- function(label) {
  temp %>% filter(SharingProblemsCat == label)
}
ownership <- pick_statement(" I do not want to use a public repository because my data ownership intellectual property might be violated ")
institution <- pick_statement(" My institutional policy allows to upload data to a public repository ")
legal <- pick_statement(' Legal aspects licensing national laws are significant hurdles for public repository usage ')
rights <- pick_statement(' For my research project s I am unsure if I own the rights to upload the data to a public repository ')
sufficient_guidance <- pick_statement(' There is sufficient guidance towards choosing an appropriate repository for my data ')
time <- pick_statement(' There is a lack of time to deposit data in a repository ')
expertise <- pick_statement(' There is a lack of expertise and human resources to deposit data in a repository ')
technic <- pick_statement(' Technical hurdles are too high to upload to a repository large data transfer lack of requested metadata ')

# First of all, we did not find major general opposition to public data sharing.
cat(ownership$percent[ownership$SharingProblems == "Agree"], '% are reluctant to share data publicly because the data ownership or intellectual property might be violated (vs. ', ownership$percent[ownership$SharingProblems == "Disagree"], ').')
cat('Interestingly, ', institution$percent[institution$SharingProblems == "Undecided"], '% participants did not know whether their institutional policy allow to up-load data to a public repository')
# The answer label is "Disagree"; the misspelt "Disargree" never matched, so
# this share was always printed empty.
cat(' while further ', institution$percent[institution$SharingProblems == "Disagree"], '% are sure that they did not')
cat('Further ', 100 - rights$percent[rights$SharingProblems == "Disagree"], ' are not sure whether they own the rights to upload data from their own research project')
cat(legal$percent[legal$SharingProblems == "Agree"], ' %) see legal aspects as significant hurdles for public repository usage.')
cat('These answers indicate major uncertainties with regard to legal issues.')
cat('Only ', sufficient_guidance$percent[sufficient_guidance$SharingProblems == "Agree"], '% think that there is sufficient guidance towards choosing an appropriate repository for my data')
cat(time$percent[time$SharingProblems == "Agree"], '% think that there is a lack of time to deposit data in a repository.')
cat('while only ', time$percent[time$SharingProblems == "Disagree"], '% disagree on this point')
cat(expertise$percent[expertise$SharingProblems == "Agree"], "% think that there is a lack of expertise and human resources to deposit data in a repository")
cat(technic$percent[technic$SharingProblems == "Agree"], "% think that the technical hurdles are too high to uplad tdat ato a repository")
cat('83% of respondents did not think that their research data must be handled in their very own, individual way. The lack of professional data management was reported as problem. 70 (54%) participants think that they would share more of their data if they had better data man-agement while only 32% think that a better data management would not increase the amount of own data to share. Due to the lack of professional data management, the preparation of an dataset for public use is a time-consuming process. 70% of those respondents how have previously prepared data for publication and re-use say that the time that they need to ready a dataset requires more than a day while 39% need even more than a week. Accordingly, 76 (60%) think that there is a lack of time to deposit data in a repository while only 31 (24%) did not think that time is a problem for the deposition of data in a public repository.')

######################################################################
######################################################################
# Rebuild the working data set, this time including the current position and
# the neuroscience-discipline columns.
neuro_data_tmp <- neuro_data %>%
  dplyr::select(Response.ID,
                Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..,
                Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.,
                Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.,
                What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.,
                Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.,
                What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.,
                What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.,
                What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.,
                Have.you.shared.data.with.....Publicly.,
                My.current..primary..position.is.,
                starts_with('For.which.of.these.tasks.'),
                starts_with('To.what.degree.do.you.'),
                starts_with('Think.of.re.using.data.'),
                starts_with('Which.neuroscience.discipline.s.'),
                starts_with('Think.of.data.sharing.'))

# Short aliases for the long survey column names (long name -> short name).
# A name that is not present is skipped.
short_names <- c(
  'My.current..primary..position.is.' = "CurrentPosition",
  'Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.' = "Existing_Data",
  'Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..' = "Other_can_answer",
  'Have.you.shared.data.with.....Publicly.' = "Shared_Publicly",
  'Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.' = 'I_know_how_to_publish_my_data_reproducible',
  'Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.' = 'I_have_RDM_personal',
  'What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.' = 'I_can_handle_RD_community_standards'
)
for (long_name in names(short_names)) {
  names(neuro_data_tmp)[names(neuro_data_tmp) == long_name] <- short_names[[long_name]]
}
# Remaining short aliases for the long survey column names
# (long name -> short name). A name that is not present is skipped.
short_names <- c(
  'What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.' = 'I_have_proficiency_in_RDM',
  'What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.' = 'Iam_highly_knowledgeable_in_RDM',
  'What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.' = 'I_know_RDM_available_Methods'
)
for (long_name in names(short_names)) {
  names(neuro_data_tmp)[names(neuro_data_tmp) == long_name] <- short_names[[long_name]]
}

# Column-name prefixes of the question groups to reshape to long format, and
# the names of the value columns they are melted into.
comb_string_vec <- c('For.which.of.these.tasks.',
                     'To.what.degree.do.you.',
                     'Think.of.re.using.data.',
                     'Which.neuroscience.discipline.s.',
                     'Think.of.data.sharing.')
comb_col_names <- c('TaskStandardTools',
                    'TaskStandardToolsDegree',
                    'ThinkReusingData',
                    'NeuroDiscipline',
                    'ThinkSharingData')

library(data.table)
data0 <- neuro_data_tmp
for (i in seq_along(comb_string_vec)) {
  is_measure <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(as.data.table(data0),
                            id.vars = which(!is_measure),
                            measure.vars = list(which(is_measure)),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i],
                            value.factor = TRUE)
  data0 <- as.data.frame(data0)

  # Make nicer labels for the category column (second to last after melt):
  # keep the text from the end of the first '...' onwards and turn dots into
  # spaces. Vectorised, so an empty level set is handled as well.
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])
  label_start <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  level_strings <- substr(level_strings, label_start, nchar(level_strings))
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

#############################
# Data sharing by current position: one row per respondent.
temp <- data %>%
  select(Response.ID, Shared_Publicly, CurrentPosition) %>%
  unique()

# Respondents per sharing answer and position; cells with fewer than three
# respondents are dropped.
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID, Shared_Publicly)) %>%
  summarise(n = n()) %>%
  filter(n >= 3) %>%
  na.omit()

library(reshape2)
# Per position: share of respondents who shared data publicly, in percent.
# dcast is called with an explicit namespace and value column because
# data.table (attached after reshape2) masks dcast, and the value column would
# otherwise be guessed.
temp_absNumbers <- reshape2::dcast(as.data.frame(temp_relFreq),
                                   CurrentPosition ~ Shared_Publicly,
                                   value.var = "n") %>%
  mutate(percent = round(Yes / (No + Yes) * 100, 0)) %>%
  na.omit() %>%
  arrange(desc(percent)) %>%
  mutate(CurrentPosition = str_replace(CurrentPosition, "Research data management focused staff", "RDM staff"))

# geom_col() is the identity-stat bar geom (replaces
# geom_histogram(stat = 'identity')).
pFD <- ggplot(data = temp_absNumbers) +
  geom_col(mapping = aes(x = reorder(CurrentPosition, percent), y = percent),
           colour = 'darkblue', fill = 'darkblue', width = 0.5) +
  coord_flip() +
  theme(text = element_text(size = 11)) +
  xlab('') +
  ylab('') +
  ggtitle('Datasharing for different scientific positions') +
  geom_text(aes(x = CurrentPosition, y = percent, label = paste0(percent, "%")),
            colour = "white", hjust = 1.5) +
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, " ", " "), width = 20))

# print() explicitly: a bare `pFD` is not auto-printed when the script is run
# via source(), which would leave the tiff file empty.
print(pFD)

ragg::agg_tiff("Fig5_Position_vs_Sharing.tiff", width = 17.5, height = 6, units = "cm", res = 600, scaling = 1)
print(pFD)
dev.off()

# Data sharing by neuroscience discipline: only disciplines answered with
# "Yes", one row per respondent and discipline.
temp <- data %>%
  select(Response.ID, Shared_Publicly, NeuroDisciplineCat, NeuroDiscipline) %>%
  filter(NeuroDiscipline == "Yes") %>%
  select(-NeuroDiscipline) %>%
  na.omit() %>%
  unique()

temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID, Shared_Publicly)) %>%
  summarise(n = n()) %>%
  filter(n >= 3) %>%
  na.omit()

library(reshape2)
# Per discipline: share of respondents who shared data publicly, in percent,
# with shortened discipline labels. The replacements run in the order listed.
temp_absNumbers <- reshape2::dcast(as.data.frame(temp_relFreq),
                                   NeuroDisciplineCat ~ Shared_Publicly,
                                   value.var = "n") %>%
  mutate(percent = round(Yes / (No + Yes) * 100, 0)) %>%
  na.omit() %>%
  arrange(desc(percent)) %>%
  mutate(NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "imaging", "Imaging"),
         NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "Theoretical", "Theoret."),
         NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "behavioral", "Behav."),
         NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "neuroscience", "Neurosci."),
         NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "Neuroscience", "Neurosci."),
         NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "e.g. electrophysiological recording behavior tracking ", ""),
         NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "e.g. patient involvement clinical trials ", ""),
         NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "science", "Science"),
         NeuroDisciplineCat = str_replace(NeuroDisciplineCat, "e.g. modeling simulation ", ""))

pFD <- ggplot(data = temp_absNumbers) +
  geom_col(mapping = aes(x = reorder(NeuroDisciplineCat, percent), y = percent),
           colour = 'darkblue', fill = 'darkblue', width = 0.5) +
  coord_flip() +
  theme(text = element_text(size = 11)) +
  xlab('') +
  ylab('') +
  ggtitle('Datasharing for different neuroscientific subdisciplines') +
  geom_text(aes(x = NeuroDisciplineCat, y = percent, label = paste0(percent, "%")),
            colour = "white", hjust = 1.5) +
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, " ", " "), width = 40))

print(pFD)

ragg::agg_tiff("Fig6_Discipline_vs_Sharing.tiff", width = 17.5, height = 10, units = "cm", res = 600, scaling = 1)
print(pFD)
dev.off()

# NOTE(review): temp_relFreq holds the per-discipline counts at this point,
# not the standard-tools counts, so the "standard Tools" sentences below look
# like a copy-paste leftover from the Figure 4 section - confirm or remove.
no <- temp_relFreq %>%
  filter(Shared_Publicly == "No") %>%
  mutate(procent = n / sum(n) * 100)
yes <- temp_relFreq %>%
  filter(Shared_Publicly == "Yes") %>%
  mutate(procent = n / sum(n) * 100)

cat("Althoug the use of standard tools are in very different areas there is")
cat("the trend that those how generally use more standard tools are more likely to share their data.")
cat("In the group that did not share their data publicly only")
cat(no$procent[2], "% use standard Tools. While in the group who shares data ")
cat(yes$procent[2], "% use standard Tools. ")
cat("A possible explanation could be that scientists who work a lot with standard tools find it easier to overcome the heavily standardized rules of public sharing of data. ")
cat("Formally, of course, it cannot be excluded that the dominant causality is opposite. However, we consider it unlikely that the motivation to share data is the main driver for a general affinity to use standard methods. ")

###
# Use of standard Tools Degree
temp <- data %>%
  select(Response.ID, Shared_Publicly, TaskStandardToolsDegreeCat, TaskStandardToolsDegree) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsDegreeCat)) %>%
  filter(TaskStandardToolsDegreeCat != "No") %>%
  na.omit() %>%
  unique() %>%
  droplevels()

# All degree answers per sharing group and task ...
temp_absNumbers_all <- temp %>%
  group_by_at(vars(Shared_Publicly, TaskStandardToolsDegreeCat, TaskStandardToolsDegree)) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / sum(n) * 100, 0)) %>%
  filter(TaskStandardToolsDegreeCat != ' Simulation ') %>%
  group_by_at(vars(Shared_Publicly))
# ... and the subset answered with 'Mostly'.
temp_absNumbers <- temp_absNumbers_all %>%
  filter(TaskStandardToolsDegree == 'Mostly')

# Percentage of 'Mostly' answers among all degree answers of one sharing
# group. Each table is indexed with its own mask; indexing the full table with
# the mask of the (shorter) 'Mostly' table recycled the mask and summed the
# wrong rows.
mostly_rate <- function(sharing_group) {
  sum(temp_absNumbers$n[temp_absNumbers$Shared_Publicly == sharing_group]) /
    sum(temp_absNumbers_all$n[temp_absNumbers_all$Shared_Publicly == sharing_group]) * 100
}
yes <- mostly_rate("Yes")
no <- mostly_rate("No")

cat(yes, " % answer mostly if they share their data while only ")
cat(no, " % using mostly standard methods for their work if they did not share their data openly")
cat("Respondents who share their data publicly have a ", (yes - no) / no * 100, "% higher rate to using 'mostly' standard tools in their daily work")

######################################################################
######################################################################
# Association analysis with data sharing: what influences data sharing?
# Rebuild the working data set, now including the time needed to prepare a
# data set for publication.
neuro_data_tmp <- neuro_data %>%
  dplyr::select(Response.ID,
                Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..,
                Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.,
                Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.,
                What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.,
                Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.,
                How.much.time.do.you.currently.need.to.ready.a.dataset.from.your.lab.for.publication.and.re.use.,
                Have.you.shared.data.with.....Publicly.,
                starts_with('For.which.of.these.tasks.'),
                starts_with('To.what.degree.do.you.'),
                starts_with('Think.of.re.using.data.'),
                starts_with('Think.of.data.sharing.'))

# Short aliases for the long survey column names (long name -> short name).
# A name that is not present in this selection is skipped.
short_names <- c(
  'Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.' = "Existing_Data",
  'Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..' = "Other_can_answer",
  'Have.you.shared.data.with.....Publicly.' = "Shared_Publicly",
  'Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.' = 'I_know_how_to_publish_my_data_reproducible',
  'Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.' = 'I_have_RDM_personal',
  'What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.' = 'I_can_handle_RD_community_standards',
  'What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.' = 'I_have_proficiency_in_RDM',
  'What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.' = 'Iam_highly_knowledgeable_in_RDM',
  'What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.' = 'I_know_RDM_available_Methods'
)
for (long_name in names(short_names)) {
  names(neuro_data_tmp)[names(neuro_data_tmp) == long_name] <- short_names[[long_name]]
}
# how_much_time
colnames(neuro_data_tmp)[
  colnames(neuro_data_tmp) ==
    'How.much.time.do.you.currently.need.to.ready.a.dataset.from.your.lab.for.publication.and.re.use.'
] <- 'how_much_time'

# Column-name patterns of the multi-column question groups and the names given
# to the resulting long-format value columns (the category column gets the
# same name with the suffix 'Cat').
comb_string_vec <- c('For.which.of.these.tasks.',
                     'To.what.degree.do.you.',
                     'Think.of.re.using.data.',
                     'Think.of.data.sharing.')
comb_col_names <- c('TaskStandardTools',
                    'TaskStandardToolsDegree',
                    'ThinkReusingData',
                    'ThinkSharingData')

library(data.table)
data0 <- neuro_data_tmp

# Melt one question group per iteration. Every melt multiplies the row count
# by the number of columns in the group, so the result can become very large.
for (i in seq_along(comb_string_vec)) {
  is_measure <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(
    as.data.table(data0),
    id.vars = which(!is_measure),
    measure.vars = list(which(is_measure)),
    variable.name = paste0(comb_col_names[i], 'Cat'),
    value.name = comb_col_names[i],
    value.factor = TRUE
  )
  data0 <- as.data.frame(data0)

  # Nicer labels for the new category factor (second-to-last column): keep the
  # text from the first '...' onwards, turn dots into spaces and restore
  # 'e.g.'. A level without '...' becomes NA, as before.
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])
  dots_end <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  level_strings <- substr(level_strings, dots_end, nchar(level_strings))
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

# Answer groupings of the agreement scale.
# NOTE(review): `not_agree` lists the agreeing answers plus 'Undecided' and
# `not_disagree` the disagreeing ones -- the names look swapped; confirm
# before using them.
agree <- c('Fully agree', 'Rather agree')
disagree <- c('Fully disagree', 'Rather disagree')
not_agree <- c('Fully agree', 'Rather agree', 'Undecided')
not_disagree <- c('Fully disagree', 'Rather disagree', 'Undecided')

# One row per respondent and answer, then the share of each
# "time needed" answer in percent, sorted ascending.
temp_absNumbers <- data %>%
  select(Response.ID, how_much_time) %>%
  group_by(how_much_time) %>%
  na.omit %>%
  unique() %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / sum(n) * 100, 0)) %>%
  arrange(percent)

# Figure 4: horizontal bar chart of the answer shares.
pFD <- ggplot(data = temp_absNumbers) +
  geom_histogram(
    mapping = aes(x = reorder(how_much_time, percent), y = percent),
    colour = 'darkblue', fill = 'darkblue', stat = 'identity', width = 0.5
  ) +
  coord_flip() +
  theme(text = element_text(size = 11)) +
  xlab('') +
  ylab('') +
  ggtitle(paste0('Time needed to ready a dataset for publication and reuse')) +
  geom_text(
    aes(x = how_much_time, y = percent, label = paste0(percent, "%")),
    colour = "white", hjust = 1.5
  ) +
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, " " , " "), width = 20))

pFD
ragg::agg_tiff("Fig4_Time.tiff", width = 17.5, height = 7, units = "cm", res = 600, scaling = 1)
pFD
dev.off()

######################################################################
######################################################################
# Which factor separates sharers from non-sharers most strongly?
# Rebuild the reduced data set, this time including the self-assessment
# items, the current position and the neuroscience disciplines.
neuro_data_tmp <- neuro_data %>%
  dplyr::select(
    Response.ID,
    Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..,
    Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.,
    Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.,
    What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.,
    Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.,
    What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.,
    What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.,
    What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.,
    Have.you.shared.data.with.....Publicly.,
    My.current..primary..position.is.,
    starts_with('For.which.of.these.tasks.'),
    starts_with('To.what.degree.do.you.'),
    starts_with('Think.of.re.using.data.'),
    starts_with('Which.neuroscience.discipline.s.'),
    starts_with('Applying.research.data.management..'),
    starts_with('Think.of.data.sharing.')
  )

# Short names for the first three long column names (old name = new name).
rename_map <- c(
  'My.current..primary..position.is.' = "CurrentPosition",
  'Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.' = "Existing_Data",
  'Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..' = "Other_can_answer"
)
rename_idx <- match(names(rename_map), colnames(neuro_data_tmp))
rename_hit <- !is.na(rename_idx)
colnames(neuro_data_tmp)[rename_idx[rename_hit]] <- unname(rename_map[rename_hit])

# Shared_Publicly
# Short names for the remaining long survey column names (old name = new
# name). Names that are not present in neuro_data_tmp are skipped.
# The former second rename of the "dedicated personnel" column to
# 'RDM_personal' could never match (the column is already called
# 'I_have_RDM_personal' by then) and has been dropped.
rename_map <- c(
  'Have.you.shared.data.with.....Publicly.' = "Shared_Publicly",
  'Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.' = 'I_know_how_to_publish_my_data_reproducible',
  'Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.' = 'I_have_RDM_personal',
  'What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.' = 'I_can_handle_RD_community_standards',
  'What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.' = 'I_have_proficiency_in_RDM',
  'What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.' = 'Iam_highly_knowledgeable_in_RDM',
  'What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.' = 'I_know_RDM_available_Methods'
)
rename_idx <- match(names(rename_map), colnames(neuro_data_tmp))
rename_hit <- !is.na(rename_idx)
colnames(neuro_data_tmp)[rename_idx[rename_hit]] <- unname(rename_map[rename_hit])

# Column-name patterns of the multi-column question groups and the names given
# to the resulting long-format value columns (category column: suffix 'Cat').
comb_string_vec <- c('For.which.of.these.tasks.',
                     'To.what.degree.do.you.',
                     'Think.of.re.using.data.',
                     'Which.neuroscience.discipline.s.',
                     'Think.of.data.sharing.')
comb_col_names <- c('TaskStandardTools',
                    'TaskStandardToolsDegree',
                    'ThinkReusingData',
                    'NeuroDiscipline',
                    'ThinkSharingData')

library(data.table)
data0 <- neuro_data_tmp

# Melt one question group per iteration. Every melt multiplies the row count
# by the number of columns in the group, so the result can become very large.
for (i in seq_along(comb_string_vec)) {
  is_measure <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(
    as.data.table(data0),
    id.vars = which(!is_measure),
    measure.vars = list(which(is_measure)),
    variable.name = paste0(comb_col_names[i], 'Cat'),
    value.name = comb_col_names[i],
    value.factor = TRUE
  )
  data0 <- as.data.frame(data0)

  # Nicer labels for the new category factor (second-to-last column): keep the
  # text from the first '...' onwards, turn dots into spaces and restore
  # 'e.g.'. A level without '...' becomes NA, as before.
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])
  dots_end <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  level_strings <- substr(level_strings, dots_end, nchar(level_strings))
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

# Self-assessment items analysed next:
#   I_can_handle_RD_community_standards
#   I_have_proficiency_in_RDM
#   Iam_highly_knowledgeable_in_RDM
#   I_know_RDM_available_Methods
# Open question kept from the original notes: the standard-tool items are
# Yes/No questions -- are only the 'Yes' answers needed?
# One row per distinct combination of respondent, public-sharing answer,
# current position and the self-assessment items; the melted question groups
# (and with them the "Comment" categories) are left out of the selection.
temp <- data %>%
  select(
    Response.ID,
    Shared_Publicly,
    CurrentPosition,
    I_can_handle_RD_community_standards,
    I_have_proficiency_in_RDM,
    Iam_highly_knowledgeable_in_RDM,
    I_know_RDM_available_Methods,
    I_have_RDM_personal
  ) %>%
  unique()

# Collapse the agreement scale: "Fully"/"Rather agree" become "Agree",
# "Fully"/"Rather disagree" become "Disagree". gsub() returns character
# vectors, so every column (including Response.ID) is rebuilt from text.
collapse_agreement <- function(x) {
  x <- gsub("Fully agree", "Agree", x)
  x <- gsub("Rather agree", "Agree", x)
  x <- gsub("Rather disagree", "Disagree", x)
  gsub("Fully disagree", "Disagree", x)
}
df <- data.frame(lapply(temp, collapse_agreement))

# For every column from the third one on: count the combinations of sharing
# answer and item answer, keep those with at least three rows, and print the
# table together with the Agree/Disagree ratio among public sharers.
for (s in seq(3, length(df), 1)) {
  print(colnames(df)[s])
  df_tmp <- df[, c(1, 2, s)]

  # NOTE(review): `percent` is scaled by 50, not 100 -- confirm this is
  # intended; the column is only printed.
  temp_relFreq <- df_tmp %>%
    group_by_at(vars(-Response.ID, Shared_Publicly)) %>%
    summarise(n = n()) %>%
    filter(n >= 3) %>%
    na.omit() %>%
    mutate(percent = round(n/sum(n)*50,0))
  print(temp_relFreq)

  # Column 2 of the summary holds the answers of the current item.
  shared_yes <- temp_relFreq$Shared_Publicly == "Yes"
  item_answer <- temp_relFreq[[2]]
  cat("Agree and Shared = ",
      temp_relFreq$n[shared_yes & item_answer == "Agree"] /
        temp_relFreq$n[shared_yes & item_answer == "Disagree"])
}

# The resulting tables were inspected by hand; the ratio between
# Yes/Agree and Yes/Disagree was computed from them and reported in the paper.
######################################################################
######################################################################
######################################################################
######################################################################
######################################################################
######################################################################
# Rebuild the long-format data set from the raw survey: all data-sharing
# columns plus the standard-tool, re-use and sharing-opinion question groups.
data0 <- neuro_data %>%
  dplyr::select(
    Response.ID,
    starts_with('Have.you.shared.data.with'),
    starts_with('For.which.of.these.tasks.'),
    starts_with('To.what.degree.do.you.'),
    starts_with('Think.of.re.using.data.'),
    starts_with('Think.of.data.sharing.')
  )

# Column-name patterns of the question groups and the names given to the
# resulting long-format value columns (category column: suffix 'Cat').
comb_string_vec <- c('For.which.of.these.tasks.',
                     'To.what.degree.do.you.',
                     'Think.of.re.using.data.',
                     'Think.of.data.sharing.',
                     'Have.you.shared.data.with')
comb_col_names <- c('TaskStandardTools',
                    'TaskStandardToolsDegree',
                    'ThinkReusingData',
                    'ThinkSharingData',
                    'DataSharing')

library(data.table)

# Melt one question group per iteration. Every melt multiplies the row count
# by the number of columns in the group, so the result can become very large.
for (i in seq_along(comb_string_vec)) {
  is_measure <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(
    as.data.table(data0),
    id.vars = which(!is_measure),
    measure.vars = list(which(is_measure)),
    variable.name = paste0(comb_col_names[i], 'Cat'),
    value.name = comb_col_names[i],
    value.factor = TRUE
  )
  data0 <- as.data.frame(data0)

  # Nicer labels for the new category factor (second-to-last column): keep the
  # text from the first '...' onwards, turn dots into spaces and restore
  # 'e.g.'. A level without '...' becomes NA, as before.
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])
  dots_end <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  level_strings <- substr(level_strings, dots_end, nchar(level_strings))
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

# Task Standard Tools plot.
# The standard-tool items are Yes/No questions ==> only the 'Yes' answers needed ==>
# or? -- open question kept from the original notes.
# Standard-tool answers together with the data-sharing answers; the free-text
# "Comment"/"Other" categories are dropped.
temp <- data %>%
  select(Response.ID, TaskStandardToolsCat, TaskStandardTools, DataSharingCat, DataSharing) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsCat)) %>%
  na.omit() %>%
  unique() %>%
  droplevels()

# Absolute counts per combination of all four answer columns.
temp_absNumbers <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n())

# Counts without the DataSharing answer, divided by a fixed 144.
# NOTE(review): 144 is hard-coded -- presumably the number of respondents;
# confirm. This table used to be stored in `temp_relFreq`, which replaced the
# `share` table before pTST was drawn; it now has its own name.
temp_relFreq_perCat <- temp %>%
  group_by_at(vars(-Response.ID, -DataSharing)) %>%
  summarise(n = n()) %>%
  mutate(percent = n / 144)

# Relative frequency of each DataSharing answer within every combination of
# the other three columns. pTST below reads `share` and `DataSharing` from
# this table, so it must still be in place when the plot is built.
temp_relFreq <- temp_absNumbers %>%
  mutate(share = n / sum(n))

# Figure 2: counts per data-sharing category.
pFD <- ggplot(data = temp_absNumbers) +
  geom_histogram(mapping = aes(x = DataSharingCat, y = n),
                 colour = 'darkblue', fill = 'darkblue', stat = 'identity', width = 0.5) +
  coord_flip() +
  xlab('') +
  ylab('') +
  ggtitle(paste0('Datasharing (n = ',sum(temp_absNumbers$n),')')) +
  geom_text(aes(x = DataSharingCat, y = n, label = n), colour = "white", hjust = 1.5) +
  # regular expression that breaks long labels into several lines
  scale_x_discrete(labels=function(x){gsub('(.{1,30})(\\s|$)', '\\1\n', x)})

pFD
tiff('Fig2_DataSharing.tiff', width = 17.5, height = 7, units = "cm", res = 600)
pFD
dev.off()

# Layers shared by the four facet plots below: axis titles, rotated x labels,
# legend placement, the 'Accent' palette and line-wrapped x labels.
facet_share_layers <- list(
  xlab(''),
  ylab('percent (%)'),
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)),
  theme(legend.position = "left", legend.box = "vertical"),
  scale_color_brewer(palette = 'Accent'),
  scale_fill_brewer(palette = "Accent"),
  scale_x_discrete(labels=function(x){gsub('(.{1,50})(\\s|$)', '\\1\n', x)})
)

# Standard tools per task, facetted by answer (rows) and sharing category (columns).
pTST <- ggplot(data = temp_relFreq) +
  geom_histogram(
    mapping = aes(x = TaskStandardToolsCat, y = share, color = DataSharing, fill = DataSharing),
    stat = 'identity', width = 0.5
  ) +
  facet_grid(TaskStandardTools~DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  facet_share_layers

png('For.which.of.these.tasks.do.you.use.available.tools.or.standards.png', width = 30, height = 20, units = "cm", res = 300)
pTST
dev.off()

# ---- Task Standard Tools Degree plot ----
temp <- data %>%
  select(Response.ID, TaskStandardToolsDegreeCat, TaskStandardToolsDegree, DataSharingCat, DataSharing) %>%
  filter(!grepl('Other', TaskStandardToolsDegreeCat)) %>%
  na.omit() %>%
  unique()

# Combine the answer levels into two classes (labels kept as in the original).
temp$TaskStandardToolsDegree <- as.character(temp$TaskStandardToolsDegree)
temp$TaskStandardToolsDegree[
  temp$TaskStandardToolsDegree %in% c('As much as possible', 'Mostly')
] <- 'Offten'
temp$TaskStandardToolsDegree[
  temp$TaskStandardToolsDegree %in% c('Occasionally', 'This is not relevant for my scientific work')
] <- 'Rare'
temp$TaskStandardToolsDegree <- as.factor(temp$TaskStandardToolsDegree)

temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))

pTSD <- ggplot(data = temp_relFreq) +
  geom_histogram(
    mapping = aes(x = TaskStandardToolsDegreeCat, y = share, color = DataSharing, fill = DataSharing),
    stat = 'identity', width = 0.5
  ) +
  facet_grid(TaskStandardToolsDegree~DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  facet_share_layers

png('To.what.degree.do.you.use.available.tools.or.standards.png', width = 30, height = 20, units = "cm", res = 300)
pTSD
dev.off()

# ---- Think of Reusing Data plot ----
temp <- data %>%
  select(Response.ID, ThinkReusingDataCat, ThinkReusingData, DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()

temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))

pTRD <- ggplot(data = temp_relFreq) +
  geom_histogram(
    mapping = aes(x = ThinkReusingDataCat, y = share, color = DataSharing, fill = DataSharing),
    stat = 'identity', width = 0.5
  ) +
  facet_grid(ThinkReusingData~DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  facet_share_layers

png('Think.of.re.using.data.from.repositories.png', width = 30, height = 20, units = "cm", res = 300)
pTRD
dev.off()

# ---- Think of Sharing Data plot ----
# (re-uses the name pTSD, which therefore no longer holds the degree plot)
temp <- data %>%
  select(Response.ID, ThinkSharingDataCat, ThinkSharingData, DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()

temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))

pTSD <- ggplot(data = temp_relFreq) +
  geom_histogram(
    mapping = aes(x = ThinkSharingDataCat, y = share, color = DataSharing, fill = DataSharing),
    stat = 'identity', width = 0.5
  ) +
  facet_grid(ThinkSharingData~DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  facet_share_layers

png('Think.of.sharing.with.researchers.who.are.NOT.direct.collaborators.png', width = 30, height = 20, units = "cm", res = 300)
pTSD
dev.off()

####################################################################
# ---- Fimiliar Data Types plot ----
# Yes/No questions ==> only the respondents who answered with 'Yes' are kept.
# The long-format `data` built directly above this part of the script has no
# FimilarDataTypes columns, so the section is skipped (with a message) unless
# `data` provides them, instead of stopping with a select() error.
if (all(c('FimilarDataTypesCat', 'FimilarDataTypes') %in% colnames(data))) {
  temp <- data %>%
    select(Response.ID, FimilarDataTypesCat, FimilarDataTypes) %>%
    filter(FimilarDataTypes != 'No') %>%
    na.omit() %>%
    unique() %>%
    group_by(FimilarDataTypes) %>%
    filter(n() >= 3)

  temp_absNumbers <- temp %>%
    group_by_at(vars(-Response.ID)) %>%
    summarise(n = n())

  pFD <- ggplot(data = temp_absNumbers) +
    geom_histogram(mapping = aes(x = FimilarDataTypesCat, y = n),
                   colour = 'darkblue', fill = 'darkblue', stat = 'identity', width = 0.5) +
    coord_flip() +
    xlab('') +
    ylab('') +
    ggtitle(paste0('Fimilar Datatypes \n(n = ',sum(temp_absNumbers$n),', multiple answers possible)')) +
    geom_text(aes(x = FimilarDataTypesCat, y = n, label = n), colour = "white", hjust = 1.5) +
    scale_x_discrete(labels=function(x){gsub('(.{1,30})(\\s|$)', '\\1\n', x)})

  # The original repeated the identical temp / temp_absNumbers / pFD
  # computation under the heading "Data availability" without printing it;
  # the objects above already hold that result.
  print(pFD)
} else {
  message("Skipping 'Fimilar Datatypes' plot: `data` has no FimilarDataTypes columns.")
}

# ---- Rebuild the long-format data set (same selection as before) ----
data0 <- neuro_data %>%
  dplyr::select(
    Response.ID,
    starts_with('Have.you.shared.data.with'),
    starts_with('For.which.of.these.tasks.'),
    starts_with('To.what.degree.do.you.'),
    starts_with('Think.of.re.using.data.'),
    starts_with('Think.of.data.sharing.')
  )
comb_string_vec <- c('For.which.of.these.tasks.',
                     'To.what.degree.do.you.',
                     'Think.of.re.using.data.',
                     'Think.of.data.sharing.',
                     'Have.you.shared.data.with')
comb_col_names <- c('TaskStandardTools',
                    'TaskStandardToolsDegree',
                    'ThinkReusingData',
                    'ThinkSharingData',
                    'DataSharing')

library(data.table)

# Melt one question group per iteration; the result can become very large.
for (i in seq_along(comb_string_vec)) {
  is_measure <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(
    as.data.table(data0),
    id.vars = which(!is_measure),
    measure.vars = list(which(is_measure)),
    variable.name = paste0(comb_col_names[i], 'Cat'),
    value.name = comb_col_names[i],
    value.factor = TRUE
  )
  data0 <- as.data.frame(data0)

  # Nicer labels for the new category factor (second-to-last column): keep the
  # text from the first '...' onwards, turn dots into spaces and restore
  # 'e.g.'. A level without '...' becomes NA, as before.
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])
  dots_end <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  level_strings <- substr(level_strings, dots_end, nchar(level_strings))
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

# Standard tools (TaskStandardToolsCat, TaskStandardTools).
# The standard-tool items are Yes/No questions ==> only the 'Yes' answers needed ==> or?
# Standard-tool usage: drop the free-text "Comment"/"Other" categories and the
# 'No' answers, then keep only answer values that occur at least three times.
temp <- data %>%
  select(Response.ID, TaskStandardToolsCat, TaskStandardTools) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsCat)) %>%
  filter(TaskStandardTools != 'No') %>%
  na.omit() %>%
  unique() %>%
  group_by(TaskStandardTools) %>%
  filter(n() >= 3)

# Absolute counts per task and answer, largest first.
temp_absNumbers <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

pST <- ggplot(data = temp_absNumbers) +
  geom_histogram(mapping = aes(x = TaskStandardToolsCat, y = n),
                 colour = 'darkblue', fill = 'darkblue', stat = 'identity', width = 0.5) +
  coord_flip() +
  xlab('') +
  ylab('') +
  ggtitle(paste0('Use of Standard Tools \n(n = ',sum(temp_absNumbers$n),', multiple answers possible)')) +
  geom_text(aes(x = TaskStandardToolsCat, y = n, label = n), colour = "white", hjust = 1.5) +
  # regular expression that breaks long labels into several lines
  scale_x_discrete(labels=function(x){gsub('(.{1,30})(\\s|$)', '\\1\n', x)})

pST
tiff('UseOfStandardTools.tiff', width = 30, height = 20, units = "cm", res = 300)
pST
dev.off()

######################################################################
######################################################################
# Among respondents who have at least one data set: distribution of the
# answers to "other researchers could answer their own research questions by
# re-using my data". na.omit() runs over every column of neuro_data_tmp.
temp_absNumbers <- neuro_data_tmp %>%
  na.omit %>%
  filter(Existing_Data != 'I have no datasets') %>%
  group_by(Other_can_answer) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

# Look the 'Yes' row up by value rather than by position, so the reported
# number stays correct when an answer category is absent from the table.
is_yes <- temp_absNumbers$Other_can_answer == 'Yes'
cat("Von den Antwortenden die mindestens einen Datensatz haben ... \n")
cat( temp_absNumbers$n[is_yes], '(', round(temp_absNumbers$percent[is_yes],3)*100, '%) , of all respondents that have at least one dataset are of the opinion that other researchers could answer their own research questions by re-using data from their research.')

# Among those who answered 'Yes': distribution of public sharing.
temp_absNumbers <- neuro_data_tmp %>%
  na.omit %>%
  filter(Other_can_answer == 'Yes') %>%
  group_by(Shared_Publicly) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

S1 <- 'However, even for this subgroup, of scientists in possession of data of which they think are valuable to others '
S2 <- '% have never shared any of their data publicly.'
# "Never shared publicly" is the "No" row; selected by value, not position.
never_shared <- temp_absNumbers$Shared_Publicly == "No"
cat(S1, round(temp_absNumbers$percent[never_shared],3)*100, S2)

#############################################################
# Of those who have data: how many think these data are also useful for
# others?
#
# Factors promoting public data sharing:
# to identify factors that promote public data sharing, the answers were
# filtered on whether participants have already shared data in public
# repositories. Respondents without any data are excluded.

# 1. Drop respondents who stated that they have no data sets.
data1 <- neuro_data %>%
  filter(Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse. != "I have no datasets")

# Quick look at the available column names (type and count are printed).
x <- colnames(neuro_data)
typeof(x)
length(x)

# 2. Reduced data set for the respondents with data. An earlier selection from
# `neuro_data` (with its own comb_string_vec / comb_col_names) was overwritten
# immediately by the assignments below and has been removed.
data0 <- data1 %>%
  dplyr::select(
    Response.ID,
    starts_with('Have.you.shared.data.with.....Publicly.'),
    starts_with('My.current..primary..position.is'),
    starts_with('Think.of.re.using.data.'),
    starts_with('How.much.time.do.you.currently.need')
  )

comb_string_vec <- c('Think.of.re.using.data.')
comb_col_names <- c('ThinkREusingData')

library(data.table)

# Melt the question group to long format; the result can become very large.
for (i in seq_along(comb_string_vec)) {
  is_measure <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(
    as.data.table(data0),
    id.vars = which(!is_measure),
    measure.vars = list(which(is_measure)),
    variable.name = paste0(comb_col_names[i], 'Cat'),
    value.name = comb_col_names[i],
    value.factor = TRUE
  )
  data0 <- as.data.frame(data0)

  # Nicer labels for the new category factor (second-to-last column): keep the
  # text from the first '...' onwards, turn dots into spaces and restore
  # 'e.g.'. A level without '...' becomes NA, as before.
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])
  dots_end <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  level_strings <- substr(level_strings, dots_end, nchar(level_strings))
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

# Fully molten copy of `data` (no id columns given).
datax <- melt(data, id = c())

####################################
# Time needed for a ready data set
# Standard Task Tools are Yes/No questions ==> just need the ones who
answered with Yes ==> or? # remove Comment columes temp = data %>% select(Response.ID,TimeNeededCat,TimeNeeded,DataSharingCat,DataSharing) %>% filter(!grepl('Comment|Other',TimeNeededCat)) %>% #filter(TimeNeededCat != 'No') %>% na.omit() %>% unique() %>% droplevels() #temp = data %>% select(Response.ID,TaskStandardToolsCat,TaskStandardTools) %>% filter(!grepl('Comment|Other',TaskStandardToolsCat)) %>% # filter(TaskStandardTools != 'No') %>% # na.omit() %>% unique() %>% group_by(TaskStandardTools) %>% filter(n() >= 3) # = data %>% select(Response.ID,TaskStandardToolsCat,TaskStandardTools) %>% filter(TaskStandardTools != 'No') %>% # na.omit() %>% unique() %>% group_by(TaskStandardTools) %>% filter(n() >= 3) View(temp) # calc abs numbers to make more own plots temp_absNumbers = temp %>% group_by_at(vars(-Response.ID)) %>% summarise(n = n()) pST = ggplot(data=temp_absNumbers) + geom_histogram(mapping=aes(x=TaskStandardToolsCat,y=n), colour = 'darkblue', fill='darkblue', stat = 'identity', width = 0.5) + coord_flip() + xlab('') + ylab('') + ggtitle(paste0('Use of Standard Tools \n(n = ',sum(temp_absNumbers$n),', multiple answers possible)')) + geom_text(aes(x=TaskStandardToolsCat,y = n, label = n), colour = "white",hjust=1.5) + scale_x_discrete(labels=function(x){gsub('(.{1,30})(\\s|$)', '\\1\n', x)}) # nice regular expression solution for multiple lined labels pST tiff('UseOfStandardTools.tiff', width = 17.5, height = 7, units = "cm", res = 600) pST dev.off() ################old########################## ############################################# # more elegant data0 = neuro_data %>% dplyr::select(Response.ID, starts_with('Have.you.shared.data.with'), starts_with('Do.you.have.existing.data'), starts_with('I.work.at'), starts_with('My.current..'), starts_with('Which.neuroscience.discipline.s.'), starts_with('Please.state.if.your.') ) comb_string_vec = c('I.work.at', 'My.current..', 'Which.neuroscience.discipline.s.', 'Please.state.if.your.', 
'Have.you.shared.data.with', 'Do.you.have.existing.data') comb_col_names = c('WorkPlaces', 'CurrentPosition', 'NeuroDiscipline', 'FimilarDataTypes', 'DataSharing', 'ExistingData') # Diese Schleife ist mit Vorsicht zu genießen. Entstehende long format Datensatz kann sehr groß werden library(data.table) for(i in seq(1,length(comb_string_vec),1)){ data0 = data.table::melt(as.data.table(data0), id= c(which(!grepl(comb_string_vec[i],colnames(data0)))), measure=list(grep(comb_string_vec[i],colnames(data0))), variable.name = paste0(comb_col_names[i],'Cat'), value.name = comb_col_names[i],value.factor=TRUE) # make some nicer labels data0 = as.data.frame(data0) level_strings = levels(data0[,ncol(data0)-1]) # iterate over the level strings and update them for(s in seq(1,length(level_strings),1)){ level_string = level_strings[s] temp = str_locate(level_string, '\\.\\.\\.') level_string = substr(level_string,temp[2],nchar(level_string)) level_string = gsub('\\.|\\.\\.',' ',level_string) level_string = gsub('e g','e.g.',level_string) level_strings[s] = level_string } # reset the labels levels(data0[,ncol(data0)-1]) = level_strings } data = data0 # make a WorkPlaces plot filter out the 'Other' answers temp = data %>% select(Response.ID,WorkPlacesCat,WorkPlaces) %>% na.omit() %>% unique() %>% group_by(WorkPlaces) %>% filter(n() >= 3) pWP = ggplot(data=temp) + geom_bar(mapping=aes(x=WorkPlaces,y=..count..),position=position_dodge()) + xlab('') + ylab('count') + theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1), plot.margin=unit(c(0,0,0,0), 'cm')) + #facet_grid(.~DataSharingCat,scales = 'fixed',margins = FALSE) + scale_fill_brewer(palette = 'Accent') + scale_x_discrete(labels=function(x){gsub('(.{1,50})(\\s|$)', '\\1\n', x)}) # nice regular expression solution for multiple lined labels png('I.work.at.png', width = 30, height = 20, units = "cm", res = 300) pWP dev.off() # make a Current Position plot temp = data %>% 
select(Response.ID,CurrentPosition,DataSharingCat,DataSharing) %>% na.omit() %>% unique() %>% group_by(CurrentPosition) %>% filter(n() >= 3) # calc relative frequency to make more own plots temp_relFreq = temp %>% group_by_at(vars(-Response.ID)) %>% summarise(n = n()) %>% mutate(share = n / sum(n)) pCP = ggplot(data=temp_relFreq) + geom_histogram(mapping=aes(x=CurrentPosition,y=share,color=DataSharing,fill=DataSharing), stat = 'identity', width = 0.5) + xlab('') + ylab('percent (%)') + theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1)) + theme(legend.position = "left", legend.box = "vertical") + facet_grid(.~DataSharingCat,scales = 'fixed',margins = FALSE,labeller = label_wrap_gen(width=30)) + scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") + scale_x_discrete(labels=function(x){gsub('(.{1,50})(\\s|$)', '\\1\n', x)}) # nice regular expression solution for multiple lined labels png('My.current.position.png', width = 30, height = 20, units = "cm", res = 300) pCP dev.off() # make a Neuro Discipline plot # Neuro Discipline questions are Yes/No questions ==> just need the ones who answered with Yes ==> or? 
# Keep every answer except 'No', one row per respondent and answer
# combination, and only NeuroDiscipline answers that occur at least 5 times.
temp <- data %>%
  dplyr::select(Response.ID, NeuroDisciplineCat, NeuroDiscipline, DataSharingCat, DataSharing) %>%
  filter(NeuroDiscipline != 'No') %>%
  na.omit() %>%
  unique() %>%
  group_by(NeuroDiscipline) %>%
  filter(n() >= 5)

# Relative frequencies: summarise() drops the last grouping level
# (DataSharing), so `share` is computed within each remaining group.
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))

# `share` is a fraction; it is scaled by 100 to match the percent axis label.
pND <- ggplot(data = temp_relFreq) +
  geom_col(
    mapping = aes(x = NeuroDisciplineCat, y = 100 * share, color = DataSharing, fill = DataSharing),
    width = 0.5
  ) +
  xlab('') +
  ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(
    NeuroDiscipline ~ DataSharingCat,
    scales = 'fixed', margins = FALSE,
    labeller = label_wrap_gen(width = 30)
  ) +
  scale_color_brewer(palette = 'Accent') +
  scale_fill_brewer(palette = "Accent") +
  # regular expression that breaks long labels into several lines
  scale_x_discrete(labels = function(x) {gsub('(.{1,50})(\\s|$)', '\\1\n', x)})

# print() is required: inside source() a bare `pND` draws nothing and the
# file would stay empty.
png('Which.neuroscience.discipline.png', width = 30, height = 20, units = "cm", res = 300)
print(pND)
dev.off()

# make a Familiar Data Types plot
# Familiar Data Types are Yes/No questions ==> just need the ones who answered with Yes ==> or?
# ---------------------------------------------------------------------------
# Helpers for the plots below. They are defined in this section so that it can
# be run on its own.
# ---------------------------------------------------------------------------

# Reshape groups of question columns from wide to long format.
#
# For every entry of `patterns` (a regular expression matched against the
# column names) all matching columns are melted into two new columns:
# `<value_name>Cat` (which question column the answer came from) and
# `<value_name>` (the answer, as a factor). The `<value_name>Cat` labels are
# shortened to the part of the column name after the first '...', with dots
# replaced by spaces. Column names without '...' get an NA label.
#
# Use with caution: every additional pattern multiplies the number of rows, so
# the resulting long-format data set can become very large.
#
# wide        data.frame with one row per respondent
# patterns    character vector of regular expressions
# value_names character vector of new column names, same length as `patterns`
# returns     long-format data.frame
melt_question_groups <- function(wide, patterns, value_names) {
  stopifnot(length(patterns) == length(value_names))
  long <- wide
  for (k in seq_along(patterns)) {
    is_measure <- grepl(patterns[k], colnames(long))
    if (!any(is_measure)) {
      stop("No column matches pattern '", patterns[k], "'", call. = FALSE)
    }
    long <- data.table::melt(
      data.table::as.data.table(long),
      id.vars = which(!is_measure),
      measure.vars = list(which(is_measure)),
      variable.name = paste0(value_names[k], 'Cat'),
      value.name = value_names[k],
      value.factor = TRUE
    )
    long <- as.data.frame(long)

    # melt() appends the category column and the value column at the end, so
    # the category column is the second to last one.
    cat_col <- ncol(long) - 1
    raw_labels <- levels(long[, cat_col])
    prefix_end <- stringr::str_locate(raw_labels, '\\.\\.\\.')[, 2]
    nice_labels <- substr(raw_labels, prefix_end, nchar(raw_labels))
    # Pattern kept verbatim: it is not safely equivalent to a plain '\\.'.
    nice_labels <- gsub('\\.|\\.\\.', ' ', nice_labels)
    nice_labels <- gsub('e g', 'e.g.', nice_labels)
    levels(long[, cat_col]) <- nice_labels
  }
  long
}

# Count rows per combination of all columns except Response.ID and add the
# relative frequency. summarise() drops the last grouping level, so `share`
# is computed within each remaining group.
relative_frequencies <- function(answers) {
  answers %>%
    group_by_at(vars(-Response.ID)) %>%
    summarise(n = n()) %>%
    mutate(share = n / sum(n))
}

# Bar plot of `share` (shown in percent) per question category, coloured by
# the DataSharing answer and facetted as `answer_col ~ DataSharingCat`.
#
# freq         output of relative_frequencies()
# category_col name of the column shown on the x axis
# answer_col   name of the column used for the facet rows
# x_wrap       maximum characters per line of an x axis label
# strip_wrap   maximum characters per line of a facet strip label
share_barplot <- function(freq, category_col, answer_col, x_wrap = 50, strip_wrap = 30) {
  force(x_wrap)
  ggplot(data = freq) +
    # `share` is a fraction; scale it to match the percent axis label
    geom_col(
      mapping = aes(x = .data[[category_col]], y = 100 * share, color = DataSharing, fill = DataSharing),
      width = 0.5
    ) +
    xlab('') +
    ylab('percent (%)') +
    theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
    theme(legend.position = "left", legend.box = "vertical") +
    facet_grid(
      stats::reformulate('DataSharingCat', response = answer_col),
      scales = 'fixed', margins = FALSE,
      labeller = label_wrap_gen(width = strip_wrap)
    ) +
    scale_color_brewer(palette = 'Accent') +
    scale_fill_brewer(palette = "Accent") +
    # regular expression that breaks long labels into several lines
    scale_x_discrete(
      labels = function(x) {gsub(paste0('(.{1,', x_wrap, '})(\\s|$)'), '\\1\n', x)}
    )
}

# Write a plot to a 300 dpi PNG file (sizes in cm). print() is required because
# a bare plot object draws nothing inside source(); on.exit() closes the
# device even when drawing fails.
save_png <- function(plot, file, width = 30, height = 20) {
  png(file, width = width, height = height, units = "cm", res = 300)
  on.exit(dev.off(), add = TRUE)
  print(plot)
  invisible(file)
}

# ---------------------------------------------------------------------------
# Familiar data types (uses the long data set built in the previous section)
# ---------------------------------------------------------------------------
temp <- data %>%
  dplyr::select(Response.ID, FimilarDataTypesCat, FimilarDataTypes, DataSharingCat, DataSharing) %>%
  #filter(FimilarDataTypes != 'No') %>%
  na.omit() %>%
  unique() %>%
  group_by(FimilarDataTypes) %>%
  filter(n() >= 5)
temp_relFreq <- relative_frequencies(temp)
pFD <- share_barplot(temp_relFreq, 'FimilarDataTypesCat', 'FimilarDataTypes')
save_png(pFD, 'Please.state.if.your.work.includes.png')

# ---------------------------------------------------------------------------
# recreate different datasets: tools/standards and attitudes towards re-use
# ---------------------------------------------------------------------------
data0 <- neuro_data %>%
  dplyr::select(
    Response.ID,
    starts_with('Have.you.shared.data.with'),
    starts_with('For.which.of.these.tasks.'),
    starts_with('To.what.degree.do.you.'),
    starts_with('Think.of.re.using.data.'),
    starts_with('Think.of.data.sharing.')
  )
comb_string_vec <- c(
  'For.which.of.these.tasks.',
  'To.what.degree.do.you.',
  'Think.of.re.using.data.',
  'Think.of.data.sharing.',
  'Have.you.shared.data.with'
)
comb_col_names <- c(
  'TaskStandardTools',
  'TaskStandardToolsDegree',
  'ThinkReusingData',
  'ThinkSharingData',
  'DataSharing'
)
data0 <- melt_question_groups(data0, comb_string_vec, comb_col_names)
data <- data0

# make a Task Standard Tools plot
# Standard Task Tools are Yes/No questions ==> just need the ones who answered with Yes ==> or?
# remove Comment columns
temp <- data %>%
  dplyr::select(Response.ID, TaskStandardToolsCat, TaskStandardTools, DataSharingCat, DataSharing) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsCat)) %>%
  #filter(TaskStandardTools != 'No') %>%
  na.omit() %>%
  unique() %>%
  droplevels()
temp_relFreq <- relative_frequencies(temp)
pTST <- share_barplot(temp_relFreq, 'TaskStandardToolsCat', 'TaskStandardTools')
save_png(pTST, 'For.which.of.these.tasks.do.you.use.available.tools.or.standards.png')

# make a Task Standard Tools Degree plot
temp <- data %>%
  dplyr::select(Response.ID, TaskStandardToolsDegreeCat, TaskStandardToolsDegree, DataSharingCat, DataSharing) %>%
  filter(!grepl('Other', TaskStandardToolsDegreeCat)) %>%
  na.omit() %>%
  unique()

# combine some levels
# NOTE(review): 'Offten' looks like a typo for 'Often'; it is shown as a facet
# label, so it is left unchanged here - confirm before renaming.
temp$TaskStandardToolsDegree <- as.character(temp$TaskStandardToolsDegree)
temp$TaskStandardToolsDegree[
  temp$TaskStandardToolsDegree == 'As much as possible' |
    temp$TaskStandardToolsDegree == 'Mostly'
] <- 'Offten'
temp$TaskStandardToolsDegree[
  temp$TaskStandardToolsDegree == 'Occasionally' |
    temp$TaskStandardToolsDegree == 'This is not relevant for my scientific work'
] <- 'Rare'
temp$TaskStandardToolsDegree <- as.factor(temp$TaskStandardToolsDegree)

temp_relFreq <- relative_frequencies(temp)
pTSD <- share_barplot(temp_relFreq, 'TaskStandardToolsDegreeCat', 'TaskStandardToolsDegree')
save_png(pTSD, 'To.what.degree.do.you.use.available.tools.or.standards.png')

# make a Think of Reusing Data plot
temp <- data %>%
  dplyr::select(Response.ID, ThinkReusingDataCat, ThinkReusingData, DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()
temp_relFreq <- relative_frequencies(temp)
pTRD <- share_barplot(temp_relFreq, 'ThinkReusingDataCat', 'ThinkReusingData')
save_png(pTRD, 'Think.of.re.using.data.from.repositories.png')

# make a Think of Sharing Data plot
# NOTE(review): this re-uses the name pTSD and overwrites the Task Standard
# Tools Degree plot object created above.
temp <- data %>%
  dplyr::select(Response.ID, ThinkSharingDataCat, ThinkSharingData, DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()
temp_relFreq <- relative_frequencies(temp)
pTSD <- share_barplot(temp_relFreq, 'ThinkSharingDataCat', 'ThinkSharingData')
save_png(pTSD, 'Think.of.sharing.with.researchers.who.are.NOT.direct.collaborators.png')

### where are the problems ###
# recreate different datasets: sharing problems and analysis workflow
data0 <- neuro_data %>%
  dplyr::select(
    Response.ID,
    starts_with('Have.you.shared.data.with'),
    starts_with('Please.indicate.'),
    starts_with('How.do.you.process.and.analyze.your.data.')
  )
comb_string_vec <- c(
  'Please.indicate.',
  'How.do.you.process.and.analyze.your.data.',
  'Have.you.shared.data.with'
)
comb_col_names <- c('SharingProblems', 'HowAnalyzeData', 'DataSharing')
data0 <- melt_question_groups(data0, comb_string_vec, comb_col_names)
data <- data0

# make a Sharing Data Problems plot
temp <- data %>%
  dplyr::select(Response.ID, SharingProblemsCat, SharingProblems, DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()
temp_relFreq <- relative_frequencies(temp)
pSP <- share_barplot(temp_relFreq, 'SharingProblemsCat', 'SharingProblems', x_wrap = 70)
save_png(pSP, 'Sharing.problems.please.indicate.png', width = 40, height = 30)

# make a How Analyze Data plot; answers given fewer than 5 times are dropped
temp <- data %>%
  dplyr::select(Response.ID, HowAnalyzeDataCat, HowAnalyzeData, DataSharingCat, DataSharing) %>%
  #filter(HowAnalyzeData != 'No') %>%
  na.omit() %>%
  unique() %>%
  group_by(HowAnalyzeData) %>%
  filter(n() >= 5)
temp_relFreq <- relative_frequencies(temp)
pHAD <- share_barplot(temp_relFreq, 'HowAnalyzeDataCat', 'HowAnalyzeData', x_wrap = 30)
save_png(pHAD, 'How.do.you.process.and.analyze.your.data.png')

# recreate different datasets: opinions, ranking of issues, data management
data0 <- neuro_data %>%
  dplyr::select(
    Response.ID,
    starts_with('Have.you.shared.data.with'),
    starts_with('What.is.your.opinion'),
    starts_with('Applying.research.data.management.'),
    starts_with('Please.rank.the.top.')
  )
comb_string_vec <- c(
  'What.is.your.opinion',
  'Please.rank.the.top.',
  'Applying.research.data.management.',
  'Have.you.shared.data.with'
)
comb_col_names <- c(
  'StatementsOpinion',
  'TopSharingProblems',
  'ApplyDataManagement',
  'DataSharing'
)
data0 <- melt_question_groups(data0, comb_string_vec, comb_col_names)
data <- data0

# make a Top Sharing Data Problems plot
temp <- data %>%
  dplyr::select(Response.ID, TopSharingProblemsCat, TopSharingProblems) %>%
  na.omit() %>%
  unique()

# group = 1 makes `prop` the proportion among all bars of a panel; it is
# scaled by 100 to match the percent axis label.
pTSP <- ggplot(data = temp) +
  geom_bar(
    mapping = aes(x = TopSharingProblems, y = 100 * after_stat(prop), group = 1),
    width = 0.5
  ) +
  xlab('') +
  ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(
    . ~ TopSharingProblemsCat,
    scales = 'fixed', margins = FALSE,
    labeller = label_wrap_gen(width = 25)
  ) +
  scale_color_brewer(palette = 'Accent') +
  scale_fill_brewer(palette = "Accent") +
  scale_x_discrete(labels = function(x) {gsub('(.{1,70})(\\s|$)', '\\1\n', x)})
save_png(pTSP, 'Please.rank.the.top.most.pressing.issues.png', width = 40, height = 20)

# make an Apply Data Management plot
temp <- data %>%
  dplyr::select(Response.ID, ApplyDataManagementCat, ApplyDataManagement, DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()
temp_relFreq <- relative_frequencies(temp)
pARM <- share_barplot(temp_relFreq, 'ApplyDataManagementCat', 'ApplyDataManagement', strip_wrap = 25)
save_png(pARM, 'Applying.research.data.management.png')

# make a What is your opinion plot
temp <- data %>%
  dplyr::select(Response.ID, StatementsOpinionCat, StatementsOpinion, DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()
temp_relFreq <- relative_frequencies(temp)
pOS <- share_barplot(temp_relFreq, 'StatementsOpinionCat', 'StatementsOpinion', strip_wrap = 25)
save_png(pOS, 'What.is.your.opinion.on.the.following.statements.png')

#### polar plot try ####
cbp1 <- c("#000000", "#FFFFFF")

# The long data set built above has no CurrentPosition column, so the position
# and data sharing questions are reshaped separately for this plot.
position_long <- neuro_data %>%
  dplyr::select(
    Response.ID,
    starts_with('Have.you.shared.data.with'),
    starts_with('My.current..')
  ) %>%
  melt_question_groups(
    c('My.current..', 'Have.you.shared.data.with'),
    c('CurrentPosition', 'DataSharing')
  )

temp <- position_long %>%
  dplyr::select(Response.ID, CurrentPosition, DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()

# NOTE(review): cbp1 provides two outline colours, so DataSharing is assumed
# to have at most two levels here - confirm.
pCP <- ggplot(data = temp) +
  geom_bar(
    mapping = aes(x = CurrentPosition, color = DataSharing, fill = DataSharingCat),
    width = 0.75
  ) +
  xlab('') +
  ylab('counts') +
  theme(
    axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1),
    plot.margin = unit(c(0, 0, 0, 0), 'cm')
  ) +
  #facet_grid(.~DataSharingCat,scales = 'fixed',margins = FALSE) +
  scale_color_manual(values = cbp1) +
  scale_fill_brewer(palette = "Dark2") +
  coord_polar(theta = 'y', clip = 'off') +
  scale_x_discrete(labels = function(x) {gsub('(.{1,20})(\\s|$)', '\\1\n', x)})
print(pCP)