# clean workspace
rm(list = ls())

# The csv file has to be in the same directory as this script.
# rstudioapi only works inside a running RStudio session. When the script is
# started any other way (e.g. Rscript), keep the current working directory
# instead of aborting with an rstudioapi error.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getSourceEditorContext()$path))
} else {
  message("RStudio not available: using the current working directory ", getwd())
}

# clear the console (sends a form feed character)
cat("\014")

# load libraries
library(ggplot2)
library(dplyr)
library(ggpubr)
library(ragg)
library(reshape2)
library(data.table)
library(stringr)
library(cowplot)
library(ggrepel)
library(forcats)
library(yarrr)
# Load the data
# Empty strings and "N/A" are read as NA.
#neuro_data <- read.csv("results-survey197421_nurkomplett.csv",row.names=NULL,na.strings=c("","N/A"),sep=',')
neuro_data2 <- read.csv("results-survey197421_alledaten.csv",
                        row.names = NULL,
                        na.strings = c("", "N/A"),
                        sep = ",")
# keep only the rows that have an answer to the current-position question
neuro_data <- neuro_data2[!is.na(neuro_data2$My.current..primary..position.is.), ]
# The analysis asks what distinguishes the people who share data from the
# others, and more generally which problems exist in our data infrastructure.

# the first column is addressed as Response.ID everywhere below
colnames(neuro_data)[1] <- "Response.ID"

# remove one outlier ==> empty row
# %in% never returns NA, so a missing Response.ID cannot be turned into an
# all-NA row the way `!= 78` would
neuro_data <- neuro_data[!(neuro_data$Response.ID %in% 78), ]

# replace whitespaces and commas in the column names by dots
colnames(neuro_data) <- str_replace_all(colnames(neuro_data), "[ ,]", ".")





#######################################################
#### Figure 1 #########################################
###############
#### Neuro Disciplines + Current Position #############
#######################################################

# Columns used for Figure 1: the response id plus every column whose name
# starts with one of the five question prefixes below.
data0 <- dplyr::select(
  neuro_data,
  Response.ID,
  starts_with('Have.you.shared.data.with'),
  starts_with('I.work.at'),
  starts_with('My.current..'),
  starts_with('Which.neuroscience.discipline.s.'),
  starts_with('Please.state.if.your.')
)

# Column-name patterns of the question groups that are melted to long format
# further down ...
comb_string_vec <- c(
  'I.work.at',
  'My.current..',
  'Which.neuroscience.discipline.s.',
  'Please.state.if.your.',
  'Have.you.shared.data.with'
)
# ... and, position by position, the name of the value column each group gets
# (the matching variable column gets the suffix 'Cat').
comb_col_names <- c(
  'WorkPlaces',
  'CurrentPosition',
  'NeuroDiscipline',
  'FimilarDataTypes',
  'DataSharing'
)

# Handle this loop with care: the resulting long-format data set can become
# very large.
# For every question group, melt its columns into a "<name>Cat" column (the
# source column) and a "<name>" column (the answer), then turn the source
# column names into readable labels.
library(data.table)
for (i in seq_along(comb_string_vec)) {
  is_group_col <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(as.data.table(data0),
                            id.vars = which(!is_group_col),
                            measure.vars = list(which(is_group_col)),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i],
                            value.factor = TRUE)

  # make some nicer labels
  data0 <- as.data.frame(data0)
  # the label code addresses the "<name>Cat" column as the second to last one
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])

  # Keep the text from the last dot of the first '...' onwards. Names without
  # '...' are kept whole: str_locate() returns NA for them, which would
  # otherwise turn the label (and with it the whole column) into NA.
  label_start <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  has_prefix <- !is.na(label_start)
  level_strings[has_prefix] <- substr(level_strings[has_prefix],
                                      label_start[has_prefix],
                                      nchar(level_strings[has_prefix]))
  # every dot becomes a space; afterwards restore the abbreviation 'e.g.'
  level_strings <- gsub('.', ' ', level_strings, fixed = TRUE)
  level_strings <- gsub('e g', 'e.g.', level_strings)

  # reset the labels
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

################################
# make a Current Position plot
# one row per respondent and position; positions with fewer than three rows
# are dropped
temp <- data %>%
  select(Response.ID, CurrentPosition) %>%
  na.omit() %>%
  unique() %>%
  group_by(CurrentPosition) %>%
  filter(n() >= 3)

# absolute (n) and relative (percent) frequency per position
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / sum(n) * 100, 0),
         # hard-coded sort key with seven entries; assumes seven position
         # rows remain at this point
         ordering = c(1, 7, 5, 6, 3, 2, 4)) %>%
  arrange(ordering) %>%
  mutate(CurrentPosition = str_replace(CurrentPosition,
                                       "Research data management focused staff",
                                       "RDM staff"))
  #mutate(CurrentPosition = fct_reorder(temp_relFreq$CurrentPosition, temp_relFreq$ordering, min))


# Wrap long labels: every run of up to 20 characters that is followed by a
# whitespace character or the end of the string is terminated with a newline
# (the whitespace itself is dropped).
my_label_func <- function(x) {
  wrap_pattern <- '(.{1,20})(\\s|$)'
  gsub(wrap_pattern, '\\1\n', x)
}

# Label data for the pie chart: for each position the y position of the label
# (cumulative percent minus half of the row's own percent, in descending
# position order) and the wrapped text "<position>: <percent>%".
label_base <- temp_relFreq %>%
  arrange(desc(CurrentPosition)) %>%
  mutate(lab_ypos = cumsum(percent) - 0.5 * percent,
         lab_label = my_label_func(paste0(CurrentPosition, ': ', percent, '%')))

# positions with more than ten respondents
df_label1 <- label_base %>%
  filter(n > 10)

# positions with at most ten respondents
df_label2 <- label_base %>%
  filter(n <= 10)


# "xmen" palette from yarrr, requested with plot.result = TRUE
mycol <- yarrr::piratepal("xmen", plot.result = TRUE, trans = .1)


## Constant x (factor(1)); bars of constant width; polar coordinates with
## theta applied to the Y axis
pie_mapping <- aes(x = factor(1), fill = CurrentPosition)
F1B <- ggplot(data = temp_relFreq, mapping = pie_mapping) +
  geom_bar(width = 1) +
  coord_polar(theta = "y") +
  scale_fill_brewer(palette = "Accent")


F1B


# Exploratory pie chart of the position shares, built up and printed in three
# steps.
df <- temp_relFreq
p <- ggplot(df, aes(x = 1, y = percent, fill = CurrentPosition)) +
  geom_bar(stat = "identity") +
  # the closing parenthesis of "(n = ..." was missing from the title
  ggtitle(paste0('Current Position (n = ', sum(df$n), ')')) +
  coord_polar(theta = 'y') +
  scale_fill_brewer(palette = "Set3")
print(p)

# second bar layer that draws the slices with a black outline
p <- p + geom_bar(stat = "identity", color = 'black')
# strip ticks, axis titles, axis text and grid lines
p <- p +
  theme(axis.ticks = element_blank(),
        axis.title = element_blank(),
        axis.text = element_blank(),
        panel.grid = element_blank())
print(p)

p <- p +
  xlab('') +
  ylab('')
print(p)

# Figure 1B: pie chart of the position shares with repelled text labels.
# geom_col() draws the precomputed percent values directly; the former
# geom_histogram(stat = 'identity') only reached the same bars through the
# histogram wrapper.
# NOTE(review): the title prints the sum of the rounded percentages followed
# by '%' as "n" - confirm that this is intended rather than sum(n).
F1B <- ggplot(data = temp_relFreq) +
  geom_col(mapping = aes(x = factor(1), y = percent, fill = CurrentPosition),
           width = 1) +
  coord_polar(theta = "y") +
  xlab('') + ylab('') +
  ggtitle(paste0('Current Position (n = ', sum(temp_relFreq$percent), '%)')) +
  theme(text = element_text(size = 7),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        panel.grid = element_blank(),
        legend.position = "none") +
  # labels from df_label1 stay on the slice, labels from df_label2 are nudged
  # along x (nudge_x = 1)
  geom_text_repel(data = df_label1,
                  aes(x = factor(1), y = lab_ypos, label = lab_label),
                  colour = "black", box.padding = 0.5, point.size = NA,
                  nudge_x = 0, size = 5.8/.pt) +
  geom_text_repel(data = df_label2,
                  aes(x = factor(1), y = lab_ypos, label = lab_label),
                  colour = "black", box.padding = 0.5, point.size = NA,
                  nudge_x = 1, size = 5.8/.pt) +
  scale_fill_brewer(palette = "Set3") #mycol) #Set3



F1B
#F1B <- p
# explicit print() so the line also shows up when the script is source()d
print(sprintf("Absolute und relative Häufigkeiten der %s", "Current Position"))
print(temp_relFreq, quote = TRUE, row.names = FALSE)
#png('CurrentPosition.png', width = 30, height = 20, units = "cm", res = 300)
ragg::agg_tiff("Fig1B.tiff", width = 8, height = 8, units = "cm", res = 600, scaling = 1)
# A ggplot is only drawn when it is printed. A bare `F1B` is auto-printed at
# the interactive top level but not under source()/Rscript, which would leave
# the TIFF empty.
print(F1B)
dev.off()



# make a Neuro Discipline plot
# Neuro Discipline questions are Yes/No questions ==> just need the ones who answered with Yes
temp <- data %>%
  select(Response.ID, NeuroDisciplineCat, NeuroDiscipline) %>%
  filter(NeuroDiscipline != 'No') %>%
  na.omit() %>%
  unique() %>%
  group_by(NeuroDiscipline) %>%
  filter(n() >= 3)

# Label clean-up, applied one step after the other in exactly this order; each
# step replaces the first match only.
label_patterns <- c(
  "imaging",
  "Theoretical",
  "Theoretical",
  "behavioral",
  "neuroscience",
  "Neuroscience",
  "e.g. electrophysiological recording behavior tracking ",
  "e.g.  patient involvement clinical trials ",
  "science",
  "e.g.  modeling simulation "
)
label_replacements <- c(
  "Imaging",
  "Theoret.",
  "Theoret.",
  "Behav.",
  "Neurosci.",
  "Neurosci.",
  "",
  "",
  "Science",
  ""
)
for (k in seq_along(label_patterns)) {
  temp <- temp %>%
    mutate(NeuroDisciplineCat = str_replace(NeuroDisciplineCat,
                                            label_patterns[k],
                                            label_replacements[k]))
}


# calc abs numbers to make more own plots
# The percentage denominator is the total n of the Current Position table
# (temp_relFreq), not the number of discipline answers.
position_total <- sum(temp_relFreq$n)
temp_absNumbers <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / position_total * 100)) %>%
  arrange(desc(percent))

#temp_relFreq2 = temp %>% 
#  group_by_at(vars(-Response.ID, -NeuroDiscipline)) %>% 
#  summarise(n = n()) %>% 
#  mutate(percent = n / 218) %>%
#  arrange(desc(percent))


#print(temp_relFreq2, quote = TRUE, row.names = FALSE)

# Figure 1A: horizontal bar chart of the discipline percentages, bars ordered
# by percent, with the value printed in white at the end of each bar.
# geom_col() replaces geom_histogram(stat = 'identity'): the values are
# already computed, nothing is binned.
F1A <- ggplot(data = temp_absNumbers) +
  geom_col(mapping = aes(x = reorder(NeuroDisciplineCat, percent), y = percent),
           colour = 'darkblue', fill = 'darkblue',
           width = 0.5) +
  coord_flip() +
  #xlab('') + ylab('') + ggtitle(paste0('Neuro Disciplines \n(n = ',sum(temp_absNumbers$n),', multiple answers possible)')) +
  xlab('') + ylab('') +
  ggtitle(paste0('Neuro Disciplines (n = ', sum(temp_absNumbers$n), ')')) +
  geom_text(aes(x = NeuroDisciplineCat, y = percent, label = paste0(" ", percent, '%')),
            colour = "white", hjust = 1.2, size = 8/.pt) +
  # wrap long axis labels at 20 characters
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, " " , " "), width = 20))   +
  #scale_x_discrete(labels=function(x){gsub('(.{1,30})(\\s|$)', '\\1\n', x)}) # nice regular expression solution for multiple lined labels
  theme(text = element_text(size = 8))

F1A
ragg::agg_tiff("Fig1A.tiff", width = 8, height = 8, units = "cm", res = 600, scaling = 1)
# print() explicitly: a bare `F1A` is not drawn under source()/Rscript, which
# would leave the TIFF empty
print(F1A)
#plt

dev.off()




# plot both graphs into one figure (F1A left, F1B right, equal widths)
F1 <- plot_grid(F1A, F1B,
                nrow = 1, ncol = 2,
                align = "h", axis = "lr",
                scale = 1, rel_widths = c(1, 1))
F1
ragg::agg_tiff("Fig1.tiff", width = 17.5, height = 8, units = "cm", res = 600, scaling = 1)
#tiff('Fig1_CurrentntPosition_AND_NeuroDiscipline.tiff', width = 30, height = 20, units = "cm", res = 300)
#png('CurrentntPositionNeuroDiszipline.png', width = 30, height = 20, units = "cm", res = 300)
# print() explicitly: a bare `F1` is not drawn under source()/Rscript, which
# would leave the TIFF empty
print(F1)
dev.off()



####################################################
#### Figure 2 ######################################
###############
#### General Data Sharing ##########################

# Columns used for the data-sharing analysis. Work place and current position
# are single columns and get their short names directly in the selection.
data0 <- neuro_data %>%
  dplyr::select(
    Response.ID,
    Work_Place = I.work.at...I.am.affiliated.with.,
    CurrentPosition = My.current..primary..position.is.,
    starts_with('Have.you.shared.data.with'),
    starts_with('Do.you.have.existing.data.sets.'),
    starts_with('Which.neuroscience.discipline.s.'),
    starts_with('Please.state.if.your.')
  )

# short name for the "existing data sets" question (no-op if the column is absent)
existing_data_col <- 'Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.'
colnames(data0)[names(data0) == existing_data_col] <- "Existing_Data"

# question groups that are melted to long format below ...
comb_string_vec <- c(
  'Which.neuroscience.discipline.s.',
  'Please.state.if.your.',
  'Have.you.shared.data.with'
)

# ... and the matching value-column names
comb_col_names <- c(
  'NeuroDiscipline',
  'FimilarDataTypes',
  'DataSharing'
)

# Handle this loop with care: the resulting long-format data set can become
# very large.
# For every question group, melt its columns into a "<name>Cat" column (the
# source column) and a "<name>" column (the answer), then turn the source
# column names into readable labels.
library(data.table)
for (i in seq_along(comb_string_vec)) {
  is_group_col <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(as.data.table(data0),
                            id.vars = which(!is_group_col),
                            measure.vars = list(which(is_group_col)),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i],
                            value.factor = TRUE)

  # make some nicer labels
  data0 <- as.data.frame(data0)
  # the label code addresses the "<name>Cat" column as the second to last one
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])

  # Keep the text from the last dot of the first '...' onwards. Names without
  # '...' are kept whole: str_locate() returns NA for them, which would
  # otherwise turn the label (and with it the whole column) into NA.
  label_start <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  has_prefix <- !is.na(label_start)
  level_strings[has_prefix] <- substr(level_strings[has_prefix],
                                      label_start[has_prefix],
                                      nchar(level_strings[has_prefix]))
  # every dot becomes a space; afterwards restore the abbreviation 'e.g.'
  level_strings <- gsub('.', ' ', level_strings, fixed = TRUE)
  level_strings <- gsub('e g', 'e.g.', level_strings)

  # reset the labels
  levels(data0[, cat_col]) <- level_strings
}
data <- data0


# complete answers to the "shared with external collaborators" question
# (temp2 is not referenced again in the part of the script shown here)
temp2 <- neuro_data %>%
  select(Response.ID, Have.you.shared.data.with.....External.collaborators.) %>%
  na.omit()


###
# Data Sharing
#temp = data %>% select(Response.ID,FimilarDataTypesCat,FimilarDataTypes) %>% filter(FimilarDataTypes != 'No') %>%
#  na.omit() %>% unique() %>% group_by(FimilarDataTypes) %>% filter(n() >= 3)
# one row per respondent and sharing category, 'No' answers removed; answer
# values with fewer than three rows are dropped
temp <- data %>%
  select(Response.ID, DataSharingCat, DataSharing) %>%
  filter(DataSharing != 'No') %>%
  na.omit() %>%
  unique() %>%
  group_by(DataSharing) %>%
  filter(n() >= 3)
#temp = data %>% select(Response.ID,DataSharingCat,DataSharing)  %>% filter()
#  na.omit() %>% unique() %>% group_by(DataSharing) %>% filter(n() >= 3)

# Hard-coded denominator for the percentages.
# NOTE(review): presumably the number of respondents to the sharing question -
# confirm against the data.
sharing_denominator <- 144

# calc abs numbers to make more own plots
temp_absNumbers <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / sharing_denominator * 100, 0))


# same counts per sharing category only, as a fraction instead of a rounded
# percentage
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID, -DataSharing)) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sharing_denominator)

# Horizontal bar chart of the data-sharing percentages, bars ordered by
# percent, with the value printed in white at the end of each bar.
# geom_col() replaces geom_histogram(stat = 'identity'): the values are
# already computed, nothing is binned.
p <- ggplot(data = temp_absNumbers) +
  geom_col(mapping = aes(x = reorder(DataSharingCat, percent), y = percent),
           colour = 'darkblue', fill = 'darkblue',
           width = 0.5) +
  coord_flip() +
  theme(text = element_text(size = 11)) +
  #theme(plot.margin = unit(c(0.5,0.2,0.2,5), "cm")) +
  xlab('') + ylab('') + ggtitle('Datasharing') +
  geom_text(aes(x = DataSharingCat, y = percent, label = paste0(percent, '%')),
            colour = "white", hjust = 1.2) +
  # wrap long axis labels at 20 characters
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, " " , " "), width = 20))
  #scale_x_discrete(labels=function(x){gsub('(.{1,30})(\\s|$)', '\\1\n', x)}) + # nice regular expression solution for multiple lined labels
   

# The theme text size is independent of the height of the graphic; it refers to the resolution
# (600 per cm).
#In the theme, the size is defined in pts. So here 15, means 15 pts. In geom_text, the size is defined in mm, so it's 15 mm.
#
#What is the relation between pts and mm or in ? If we want exactly the same size for the title and the text in the plot, how can we define it ? It needs some conversion :
#  
#  1 pt = 1/72 in
#1 pt = 0.35 mm
#So if we want the text to be the same size as the title, the size in mm will be 15 pt * 0.35 pt/mm = 5.25 mm
#
#In ggplot, there is a constant defined to make the conversion, .pt = 2.845276. (1/.pt = 0.35). You can type in .pt in the console and it will display its value :
#  
#  ggplot2::.pt
## [1] 2.845276
#So to make the conversion :
#  
#  from pt to mm : mm = pt / .pt -> 15 / 2.845276 = 5.27
#from mm to pt : pt = mm * .pt -> 5.27 * 2.845276 = 15
#Let's change the size of the geom_text to be the same of the title by using size = 15/.pt :
#  
#  plt <- penguins %>%
#  ggplot(aes(bill_length_mm, bill_depth_mm, color = species)) +
#  geom_point()+
#  geom_text(x = 45, y = 20, label = "Example of font problem", size = 15/.pt, inherit.aes = FALSE) +
#  labs(title = "Bill length and depth relation by species") +
#  theme(plot.title = element_text(size = 15))

# Affinity Designer reports a 2 cm margin ...
p
#tiff('Fig3_Use_Standard_Tools.tiff', width = 17.5, height = 8, units = "cm", res = 600)
#pFD
ragg::agg_tiff("Fig2_DataSharing3.tiff", width = 17.5, height = 6, units = "cm", res = 600, scaling = 1)

# print() explicitly: a bare `p` is not drawn under source()/Rscript, which
# would leave the TIFF empty
print(p)
#plt

dev.off()





#####################################
####
# Reusing data of others
#####################################

# Three questions for the re-use statistics, renamed to short names directly
# in the selection (column order as listed).
neuro_data_tmp <- neuro_data %>%
  dplyr::select(
    Response.ID,
    Other_can_answer = Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..,
    Existing_Data = Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.,
    Shared_Publicly = Have.you.shared.data.with.....Publicly.
  )
# how many have no data
# (this first tally over all answer combinations is overwritten right below)
temp_absNumbers <- neuro_data_tmp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n())

# complete rows of everyone who did not answer 'I have no datasets'
with_datasets <- neuro_data_tmp %>%
  na.omit() %>%
  filter(Existing_Data != 'I have no datasets')

# counts and shares of the remaining Existing_Data answers
temp_absNumbers <- with_datasets %>%
  group_by_at(vars(Existing_Data)) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

# the figures are read from the third row of the summary
cat("Von den Antwortenden die mindestens einen Datensatz haben ... wieviel Prozent haben diesen verfuegbar gemacht")
cat(temp_absNumbers$n[3], '(', round(temp_absNumbers$percent[3], 3) * 100, '%) haben diese DAten verfuegbar gemacht')

print(temp_absNumbers, quote = TRUE, row.names = FALSE)


# counts and shares of the Other_can_answer answers within the same subgroup
temp_absNumbers <- with_datasets %>%
  group_by_at(vars(Other_can_answer)) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

cat("Von den Antwortenden die mindestens einen Datensatz haben ... ")
cat(temp_absNumbers$n[3], '(', round(temp_absNumbers$percent[3], 3) * 100, '%) , of all respondents that have at least one dataset are of the opinion that other researchers could answer their own research questions by re-using data from their research.')

# public sharing among the complete rows that answered Other_can_answer == 'Yes'
temp_absNumbers <- neuro_data_tmp %>%
  na.omit() %>%
  #filter(!Existing_Data == 'I have no datasets') %>%
  filter(Other_can_answer == 'Yes') %>%
  group_by_at(vars(Shared_Publicly)) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

# the share is read from the first row of the summary
S1 <- 'However, even for this subgroup, of scientists in possession of data of which they think are valuable to others '
S2 <- '% have never shared any of their data publicly.'
cat(S1, round(temp_absNumbers$percent[1], 3) * 100, S2)




##########################################################
##########################################################

# Research data management skills are essential for preparing, analyzing, and publicly sharing data. 
# Only 18% think that they have proficiency in research data management and only 34% think that they 
# know which research data management methods are available. 
# Interestingly, 58% of all respondents nevertheless think that they can handle their research data 
# according to community standards. This could be due to the availability of data research managers 
# who help in data handling. 
# However, only 25 (20%) of participants have dedicated personnel with research data management 
# or data curation expertise. 




# Questions for the research-data-management (RDM) statistics, renamed to
# short names directly in the selection (column order as listed).
neuro_data_tmp <- neuro_data %>%
  dplyr::select(
    Response.ID,
    Other_can_answer = Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..,
    Existing_Data = Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.,
    I_have_RDM_personal = Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.,
    I_can_handle_RD_community_standards = What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.,
    I_know_how_to_publish_my_data_reproducible = Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.,
    I_have_proficiency_in_RDM = What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.,
    Iam_highly_knowledgeable_in_RDM = What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.,
    I_know_RDM_available_Methods = What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.,
    Shared_Publicly = Have.you.shared.data.with.....Publicly.
  )
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''

# Console summary for the research-data-management paragraph: text fragments
# S1..S5 are printed alternating with "n ( percent %)" figures.
# The line breaks were written as '/n', which cat() prints literally; they are
# '\n' now.
S1 <- 'Research data management skills are essential for preparing, analyzing, and publicly sharing data. \n Only '
S2 <- 'think that they have proficiency in research data management and only '
S3 <- 'think that they know which research data management methods are available. \n Interestingly, '
S4 <- 'of all respondents nevertheless think that they can handle their research data according to community standards. This could be due to the availability of data research managers who help in data handling. However, only '
S5 <- 'of participants have dedicated personnel with research data management or data curation expertise.'

# Count the answers of one agreement question.
#   survey: data frame holding the answer column
#   column: name of the answer column (character)
# Answers containing " agree" are recoded to "Agree" and answers containing
# " disagree" to "Disagree"; the result has one row per answer value with its
# count `n` and its share `percent`.
# NOTE(review): na.omit() runs on the whole data frame, so a row with an NA in
# any column of `survey` is dropped, not only rows with an NA in `column` -
# confirm that this is intended.
summarise_agreement <- function(survey, column) {
  answers <- survey[[column]]
  answers <- replace(answers, str_detect(answers, " agree"), "Agree")
  answers <- replace(answers, str_detect(answers, " disagree"), "Disagree")
  survey[[column]] <- answers
  survey %>%
    group_by_at(column) %>%
    na.omit() %>%
    summarise(n = n()) %>%
    mutate(percent = n / sum(n))
}

# Print "n ( percent %)" for the row of `counts` whose `column` equals `answer`.
cat_share <- function(counts, column, answer) {
  hit <- counts[[column]] == answer
  cat(counts$n[hit], '(', round(counts$percent[hit], 3) * 100, '%)')
}


cat(S1)
agree <- c('Fully agree', 'Rather agree')
disagree <- c('Fully disagree', 'Rather disagree')
not_agree <- c('Fully agree', 'Rather agree', 'Undecided')
not_disagree <- c('Fully disagree', 'Rather disagree', 'Undecided')

df_np <- summarise_agreement(neuro_data_tmp, "I_have_proficiency_in_RDM")
cat_share(df_np, "I_have_proficiency_in_RDM", 'Agree')
cat(S2)


df_np <- summarise_agreement(neuro_data_tmp, "I_know_RDM_available_Methods")
cat_share(df_np, "I_know_RDM_available_Methods", 'Agree')
cat(S3)


df_np <- summarise_agreement(neuro_data_tmp, "I_can_handle_RD_community_standards")
cat_share(df_np, "I_can_handle_RD_community_standards", 'Agree')
cat(S4)


# the personnel question is counted as answered, without recoding
df_np <- neuro_data_tmp %>%
  group_by_at(vars(I_have_RDM_personal)) %>%
  na.omit() %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

cat_share(df_np, "I_have_RDM_personal", 'Yes, in my lab')
cat(S5)




###########################################################
###########################################################
# Use of tools and standards 
###########################################################
#
# We inquired about the use of existing tools and standards for different research 
# data management activities, if this process step was relevant for the participants. 

# Questions for the tools-and-standards analysis: the RDM questions, renamed
# to short names directly in the selection, plus every column of the two
# "tasks" question groups.
neuro_data_tmp <- neuro_data %>%
  dplyr::select(
    Response.ID,
    Other_can_answer = Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..,
    Existing_Data = Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.,
    I_have_RDM_personal = Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.,
    I_can_handle_RD_community_standards = What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.,
    I_know_how_to_publish_my_data_reproducible = Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.,
    I_have_proficiency_in_RDM = What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.,
    Iam_highly_knowledgeable_in_RDM = What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.,
    I_know_RDM_available_Methods = What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.,
    Shared_Publicly = Have.you.shared.data.with.....Publicly.,
    starts_with('For.which.of.these.tasks.'),
    starts_with('To.what.degree.do.you.')
  )
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''
#colnames(neuro_data_tmp)[which(names(neuro_data_tmp) == '')] <- ''

# Column-name patterns of the question groups that are collapsed into long
# format, and the name given to each resulting value column. The matching
# category column gets the same name with the suffix 'Cat'.
comb_string_vec <- c('For.which.of.these.tasks.',
                     'To.what.degree.do.you.')
comb_col_names <- c('TaskStandardTools',
                    'TaskStandardToolsDegree')

library(data.table)
data0 <- neuro_data_tmp
for (i in seq_along(comb_string_vec)) {
  # columns whose name contains the current pattern are melted,
  # all remaining columns are kept as identifiers
  in_group <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(as.data.table(data0),
                            id = which(!in_group),
                            measure = list(which(in_group)),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i], value.factor = TRUE)
  data0 <- as.data.frame(data0)

  # The second-to-last column is the category factor just created; its
  # levels are the original column names. Turn them into readable labels:
  # cut everything before the end of the first '...', then replace the
  # remaining dots by blanks and restore the abbreviation 'e.g.'.
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])
  cut_from <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  level_strings <- substr(level_strings, cut_from, nchar(level_strings))
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)
  levels(data0[, cat_col]) <- level_strings
}
data <- data0



# Opening sentence of the generated results text.
S1 <- 'We inquired about the use of existing tools and standards for different research data man-agement activities, if this process step was relevant for the participants. '
cat(S1)

# Answer groupings for the agreement scale (not referenced in this section).
agree <- c('Fully agree', 'Rather agree')
disagree <- c('Fully disagree', 'Rather disagree')
not_agree <- c('Fully agree', 'Rather agree', 'Undecided')
not_disagree <- c('Fully disagree', 'Rather disagree', 'Undecided')

# Count the answers to one agreement item after merging every answer that
# contains " agree" into "Agree" and every answer that contains " disagree"
# into "Disagree".
#   df   : data frame holding the item
#   item : column name of the item, as a string
# Returns one row per remaining answer with the count `n` and its share
# `percent` (a fraction of all counted rows).
# na.omit() is applied to the whole table, so a row with a missing value in
# ANY column of `df` is excluded, not only rows where `item` is missing.
# NOTE(review): confirm that this complete-case filter is intended.
summarise_agreement <- function(df, item) {
  answers <- df[[item]]
  answers <- replace(answers, str_detect(answers, " agree"), "Agree")
  answers <- replace(answers, str_detect(answers, " disagree"), "Disagree")
  df[[item]] <- answers
  df %>%
    group_by_at(item) %>%
    na.omit %>%
    summarise(n = n()) %>%
    mutate(percent = n / sum(n))
}

# NOTE(review): S2 ... S5 are not assigned anywhere in this part of the
# script; they have to exist before the cat() calls below are reached.

# "I have proficiency in research data management"
df_np <- summarise_agreement(neuro_data_tmp, "I_have_proficiency_in_RDM")
is_agree <- df_np$I_have_proficiency_in_RDM == 'Agree'
cat(df_np$n[is_agree], '(', round(df_np$percent[is_agree], 3) * 100, '%)')
cat(S2)

# "I know well which research data management methods are available"
df_np <- summarise_agreement(neuro_data_tmp, "I_know_RDM_available_Methods")
is_agree <- df_np$I_know_RDM_available_Methods == 'Agree'
cat(df_np$n[is_agree], '(', round(df_np$percent[is_agree], 3) * 100, '%)')
cat(S3)

# "I can handle my research data according to community standards"
df_np <- summarise_agreement(neuro_data_tmp, "I_can_handle_RD_community_standards")
is_agree <- df_np$I_can_handle_RD_community_standards == 'Agree'
cat(df_np$n[is_agree], '(', round(df_np$percent[is_agree], 3) * 100, '%)')
cat(S4)

# Dedicated RDM personnel: answers are counted as given, without merging.
df_np <- neuro_data_tmp %>%
  group_by_at(vars(I_have_RDM_personal)) %>%
  na.omit %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

in_lab <- df_np$I_have_RDM_personal == 'Yes, in my lab'
cat(df_np$n[in_lab], '(', round(df_np$percent[in_lab], 3) * 100, '%)')
cat(S5)




######################################################################


####################################################
#### Figure 3 ######################################
###############
#### Tools ##########################



# Input for the "use of standard tools" plot.
# One row per respondent, task and answer; the free-text 'Comment' / 'Other'
# categories and rows with missing values are dropped.
temp <- data %>%
  select(Response.ID, TaskStandardToolsCat, TaskStandardTools) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsCat)) %>%
  na.omit() %>%
  unique() %>%
  droplevels()

# Answer counts per task, plus the share relative to a fixed denominator.
# NOTE(review): 159 is hard-coded; presumably the number of respondents.
temp_relFreq <- temp %>%
  group_by(TaskStandardToolsCat, TaskStandardTools) %>%
  summarise(n = n()) %>%
  mutate(share = n / 159)

###
# Use of standard Tools
# percent is computed within each task (the grouping left after summarise),
# then the 'No' answers and the ' Simulation ' task are removed.
temp_absNumbers <- temp %>%
  group_by(TaskStandardToolsCat, TaskStandardTools) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / sum(n) * 100, 0)) %>%
  filter(TaskStandardTools != 'No',
         TaskStandardToolsCat != ' Simulation ') %>%
  arrange(percent)





# Figure 3: horizontal bar chart of the percentage per task in
# temp_absNumbers, bars ordered by percent and labelled with the value.
# geom_col() draws the pre-computed percentages directly.
pFD <- ggplot(data = temp_absNumbers) +
  geom_col(mapping = aes(x = reorder(TaskStandardToolsCat, percent), y = percent),
           colour = 'darkblue', fill = 'darkblue',
           width = 0.5) +
  coord_flip() +
  theme(text = element_text(size = 11)) +
  xlab('') + ylab('') + ggtitle(paste0('Use of Standard Tools for ...')) +
  geom_text(aes(x = TaskStandardToolsCat, y = percent, label = paste0(percent, "%")), colour = "white", hjust = 1.5) +
  # wrap long task labels onto several lines
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, " " , " "), width = 20))

# Show the plot on the current device. print() is required because a bare
# `pFD` is only auto-printed in interactive use, not when the script is
# run through source().
print(pFD)

# Write the figure to disk; without print() the TIFF would stay empty when
# the script is sourced.
ragg::agg_tiff("Fig3.tiff", width = 17.5, height = 10, units = "cm", res = 600, scaling = 1)
print(pFD)
dev.off()




######################################################################
######################################################################


####################################################
#### Figure 4 ######################################
###############
#### Tools vs. DataSharing #########################


# Rebuild the working table for the tools-vs-sharing analysis: the response
# id, nine single-answer items and four groups of multi-column questions.
neuro_data_tmp <- neuro_data %>%
  dplyr::select(
    Response.ID,
    Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..,
    Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.,
    Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.,
    What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.,
    Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.,
    What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.,
    What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.,
    What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.,
    Have.you.shared.data.with.....Publicly.,
    starts_with('For.which.of.these.tasks.'),
    starts_with('To.what.degree.do.you.'),
    starts_with('Think.of.re.using.data.'),
    starts_with('Think.of.data.sharing.')
  )

# Short working names for the single-answer items. Names of the vector are
# the imported headers, values are the aliases; a header that is not
# present matches nothing and is skipped.
short_names <- c(
  'Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.' = "Existing_Data",
  'Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..' = "Other_can_answer",
  'Have.you.shared.data.with.....Publicly.' = "Shared_Publicly",
  'Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.' = 'I_know_how_to_publish_my_data_reproducible',
  'Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.' = 'I_have_RDM_personal',
  'What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.' = 'I_can_handle_RD_community_standards',
  'What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.' = 'I_have_proficiency_in_RDM',
  'What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.' = 'Iam_highly_knowledgeable_in_RDM',
  'What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.' = 'I_know_RDM_available_Methods'
)
for (long_name in names(short_names)) {
  colnames(neuro_data_tmp)[names(neuro_data_tmp) == long_name] <- short_names[[long_name]]
}



# Column-name patterns of the question groups that are collapsed into long
# format, and the name given to each resulting value column. The matching
# category column gets the same name with the suffix 'Cat'.
comb_string_vec <- c('For.which.of.these.tasks.',
                     'To.what.degree.do.you.',
                     'Think.of.re.using.data.',
                     'Think.of.data.sharing.')

comb_col_names <- c('TaskStandardTools',
                    'TaskStandardToolsDegree',
                    'ThinkReusingData',
                    'ThinkSharingData')

library(data.table)
data0 <- neuro_data_tmp
for (i in seq_along(comb_string_vec)) {
  # columns whose name contains the current pattern are melted,
  # all remaining columns are kept as identifiers
  in_group <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(as.data.table(data0),
                            id = which(!in_group),
                            measure = list(which(in_group)),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i], value.factor = TRUE)
  data0 <- as.data.frame(data0)

  # The second-to-last column is the category factor just created; its
  # levels are the original column names. Turn them into readable labels:
  # cut everything before the end of the first '...', then replace the
  # remaining dots by blanks and restore the abbreviation 'e.g.'.
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])
  cut_from <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  level_strings <- substr(level_strings, cut_from, nchar(level_strings))
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

# Use of standard tools, split by whether the respondent shared data
# publicly. One row per respondent, task and answer; the free-text
# 'Comment' / 'Other' categories and rows with missing values are dropped.
temp <- data %>%
  select(Response.ID, Shared_Publicly, TaskStandardToolsCat, TaskStandardTools) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsCat)) %>%
  na.omit() %>%
  unique() %>%
  droplevels()

# Answer counts per sharing group, pooled over all tasks.
# NOTE(review): 159 is hard-coded; presumably the number of respondents.
temp_relFreq <- temp %>%
  group_by(Shared_Publicly, TaskStandardTools) %>%
  summarise(n = n()) %>%
  mutate(share = n / 159)

# Distribution of the answers within each sharing group, in percent.
no <- temp_relFreq %>%
  filter(Shared_Publicly == "No") %>%
  mutate(procent = n / sum(n) * 100)
yes <- temp_relFreq %>%
  filter(Shared_Publicly == "Yes") %>%
  mutate(procent = n / sum(n) * 100)

# The share of "Yes" answers is looked up by its label rather than by row
# position, so the printed value does not depend on the order or number of
# answer levels.
cat("Althoug the use of standard tools are in very different areas  there is")
cat("the trend that those how generally use more standard tools are more likely to share their data.")
cat("In the group that did not share their data publicly only")
cat(no$procent[no$TaskStandardTools == "Yes"], "% use standard Tools. While in the group who shares data ")
cat(yes$procent[yes$TaskStandardTools == "Yes"], "% use standard Tools. ")
cat("A possible explanation could be that scientists who work a lot with standard tools find it easier to overcome the heavily standardized rules of public sharing of data. ")
cat("Formally, of course, it cannot be excluded that the dominant causality is opposite. However, we consider it unlikely that the motivation to share data is the main driver for a general affinity to use standard methods. ")






###
# Use of standard Tools Degree
# Degree of standard-tool use, split by whether the respondent shared data
# publicly. One row per respondent, task and answer; the free-text
# 'Comment' / 'Other' categories and rows with missing values are dropped.
temp <- data %>%
  select(Response.ID, Shared_Publicly, TaskStandardToolsDegreeCat, TaskStandardToolsDegree) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsDegreeCat)) %>%
  na.omit() %>%
  unique() %>%
  droplevels()

# Counts for every answer level, per sharing group and task (the
# ' Simulation ' task is excluded). percent is the share of an answer
# within its sharing group and task.
temp_absNumbers_all <- temp %>%
  group_by_at(vars(Shared_Publicly, TaskStandardToolsDegreeCat, TaskStandardToolsDegree)) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / sum(n) * 100, 0)) %>%
  filter(TaskStandardToolsDegreeCat != ' Simulation ') %>%
  group_by_at(vars(Shared_Publicly))

# Same table restricted to the answer 'Mostly'.
temp_absNumbers <- temp_absNumbers_all %>%
  filter(TaskStandardToolsDegree == 'Mostly')

# Share of 'Mostly' answers among all answers, per sharing group.
# Each table is subset with a mask built from that same table; a mask taken
# from the shorter 'Mostly' table would be recycled over the longer one and
# sum the wrong rows.
mostly_yes <- sum(temp_absNumbers$n[temp_absNumbers$Shared_Publicly == "Yes"])
all_yes <- sum(temp_absNumbers_all$n[temp_absNumbers_all$Shared_Publicly == "Yes"])
mostly_no <- sum(temp_absNumbers$n[temp_absNumbers$Shared_Publicly == "No"])
all_no <- sum(temp_absNumbers_all$n[temp_absNumbers_all$Shared_Publicly == "No"])

yes <- mostly_yes / all_yes * 100
no <- mostly_no / all_no * 100
cat(yes, " % answer mostly if they share their data while only ")
cat(no, " % using mostly standard methods for their work if they did not share their data openly")
cat("Respondents who share their data publicly have a " , (yes-no)/no*100,  "% higher rate to using 'mostly' standard tools in their daily work")




####################################################
####          ######################################
###############
#### Obstacles for Data sharing DataSharing #########################



# Working table for the data-sharing obstacles: response id plus three
# groups of multi-column questions.
data0 <- neuro_data %>%
  dplyr::select(Response.ID,
                starts_with('Have.you.shared.data.with'),
                starts_with('Please.indicate.'),
                starts_with('How.do.you.process.and.analyze.your.data.'))

# Column-name patterns of the question groups that are collapsed into long
# format, and the name given to each resulting value column. The matching
# category column gets the same name with the suffix 'Cat'.
comb_string_vec <- c('Please.indicate.',
                     'How.do.you.process.and.analyze.your.data.',
                     'Have.you.shared.data.with')
comb_col_names <- c('SharingProblems',
                    'HowAnalyzeData',
                    'DataSharing')

library(data.table)
for (i in seq_along(comb_string_vec)) {
  # columns whose name contains the current pattern are melted,
  # all remaining columns are kept as identifiers
  in_group <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(as.data.table(data0),
                            id = which(!in_group),
                            measure = list(which(in_group)),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i], value.factor = TRUE)
  data0 <- as.data.frame(data0)

  # The second-to-last column is the category factor just created; its
  # levels are the original column names. Turn them into readable labels:
  # cut everything before the end of the first '...', then replace the
  # remaining dots by blanks and restore the abbreviation 'e.g.'.
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])
  cut_from <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  level_strings <- substr(level_strings, cut_from, nchar(level_strings))
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)
  levels(data0[, cat_col]) <- level_strings
}
data <- data0




# Opening sentence of the generated results text.
S1 <- 'We inquired about the use of existing tools and standards for different research data man-agement activities, if this process step was relevant for the participants. '

cat(S1)

# Answer groupings for the agreement scale (not referenced in this section).
agree <- c('Fully agree', 'Rather agree')
disagree <- c('Fully disagree', 'Rather disagree')
not_agree <- c('Fully agree', 'Rather agree', 'Undecided')
not_disagree <- c('Fully disagree', 'Rather disagree', 'Undecided')

# Answer distribution per statement about data-sharing obstacles.
# - unique(): `data` repeats every respondent/statement pair once per
#   combination of the other melted question groups; keeping one row per
#   pair makes `n` a count of respondents.
# - as.character(): the melted answer column is a factor, and assigning the
#   new labels "Agree" / "Disagree" to a factor that does not have these
#   levels would produce NA instead of the merged answer.
# percent is the share of an answer within its statement.
temp <- data %>%
  select(Response.ID, SharingProblemsCat, SharingProblems) %>%
  unique() %>%
  mutate(SharingProblems = as.character(SharingProblems)) %>%
  mutate(SharingProblems = replace(SharingProblems, str_detect(SharingProblems, " agree"), "Agree")) %>%
  mutate(SharingProblems = replace(SharingProblems, str_detect(SharingProblems, " disagree"), "Disagree")) %>%
  na.omit %>%
  group_by_at(vars(SharingProblemsCat, SharingProblems)) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / sum(n) * 100, 0))

# One table per statement; the labels must match the cleaned column labels
# exactly, including their leading, trailing and repeated blanks.
ownership <- temp %>%
  filter(SharingProblemsCat == " I do not want to use a public repository because my data ownership  intellectual property might be violated ")

institution <- temp %>%
  filter(SharingProblemsCat == " My institutional policy allows to upload data to a public repository ")

legal <- temp %>%
  filter(SharingProblemsCat == ' Legal aspects licensing national laws    are significant hurdles for public repository usage ')

rights <- temp %>%
  filter(SharingProblemsCat == ' For my research project s  I am unsure if I own the rights to upload the data to a public repository ')

sufficient_guidance <- temp %>%
  filter(SharingProblemsCat == ' There is sufficient guidance towards choosing an appropriate repository for my data ')

# NOTE(review): this variable shadows base R's time() for the rest of the script.
time <- temp %>%
  filter(SharingProblemsCat == ' There is a lack of time to deposit data in a repository ')

expertise <- temp %>%
  filter(SharingProblemsCat == ' There is a lack of expertise and human resources to deposit data in a repository ')

technic <- temp %>%
  filter(SharingProblemsCat == ' Technical hurdles are too high to upload to a repository large data transfer lack of requested metadata    ')

# First of all, we did not find major general opposition to public data sharing.
cat( ownership$percent[ownership$SharingProblems=="Agree"] ,'% are reluctant to share data publicly because the data ownership or intellectual property might be violated (vs. ', ownership$percent[ownership$SharingProblems=="Disagree"] , ').') 
cat('Interestingly, ', institution$percent[institution$SharingProblems=="Undecided"], '% participants did not know whether their institutional policy allow to up-load data to a public repository')
# the merged label is "Disagree"; a misspelt label matches no row and prints nothing
cat(' while further ', institution$percent[institution$SharingProblems=="Disagree"],'% are sure that they did not' )
cat('Further ', 100-rights$percent[rights$SharingProblems=="Disagree"], ' are not sure whether they own the rights to upload data from their own research project')
cat(legal$percent[legal$SharingProblems=="Agree"], ' %) see legal aspects as significant hurdles for public repository usage.')
cat('These answers indicate major uncertainties with regard to legal issues.')

cat('Only ', sufficient_guidance$percent[sufficient_guidance$SharingProblems=="Agree"], '% think that there is sufficient guidance towards choosing an appropriate repository for my data')
cat(time$percent[time$SharingProblems=="Agree"], '% think that there is a lack of time to deposit data in a repository.')
cat('while only ',time$percent[time$SharingProblems=="Disagree"], '% disagree on this point')
cat(expertise$percent[expertise$SharingProblems=="Agree"], "% think that there is a lack of expertise and human resources to deposit data in a repository")
cat(technic$percent[technic$SharingProblems=="Agree"], "% think that the technical hurdles are too high to uplad tdat ato a repository")

# Fixed text; the figures in this paragraph are typed in, not computed here.
cat('83% of respondents did not think that their research data must be handled in their very own, individual way. The lack of professional data management was reported as problem. 70 (54%) participants think that they would share more of their data if they had better data man-agement while only 32% think that a better data management would not increase the amount of own data to share. Due to the lack of professional data management, the preparation of an dataset for public use is a time-consuming process. 70% of those respondents how have previously prepared data for publication and re-use say that the time that they need to ready a dataset requires more than a day while 39% need even more than a week.   Accordingly, 76 (60%) think that there is a lack of time to deposit data in a repository while only 31 (24%) did not think that time is a problem for the deposition of data in a public repository.')




######################################################################
######################################################################



# Rebuild the working table for the position / discipline analysis: the
# response id, ten single-answer items and five groups of multi-column
# questions.
neuro_data_tmp <- neuro_data %>%
  dplyr::select(
    Response.ID,
    Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..,
    Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.,
    Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.,
    What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.,
    Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.,
    What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.,
    What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.,
    What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.,
    Have.you.shared.data.with.....Publicly.,
    My.current..primary..position.is.,
    starts_with('For.which.of.these.tasks.'),
    starts_with('To.what.degree.do.you.'),
    starts_with('Think.of.re.using.data.'),
    starts_with('Which.neuroscience.discipline.s.'),
    starts_with('Think.of.data.sharing.')
  )

# Short working names for the single-answer items. Names of the vector are
# the imported headers, values are the aliases; a header that is not
# present matches nothing and is skipped.
short_names <- c(
  'My.current..primary..position.is.' = "CurrentPosition",
  'Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.' = "Existing_Data",
  'Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..' = "Other_can_answer",
  'Have.you.shared.data.with.....Publicly.' = "Shared_Publicly",
  'Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.' = 'I_know_how_to_publish_my_data_reproducible',
  'Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.' = 'I_have_RDM_personal',
  'What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.' = 'I_can_handle_RD_community_standards',
  'What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.' = 'I_have_proficiency_in_RDM',
  'What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.' = 'Iam_highly_knowledgeable_in_RDM',
  'What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.' = 'I_know_RDM_available_Methods'
)
for (long_name in names(short_names)) {
  colnames(neuro_data_tmp)[names(neuro_data_tmp) == long_name] <- short_names[[long_name]]
}




# Column-name patterns of the question groups that are collapsed into long
# format, and the name given to each resulting value column. The matching
# category column gets the same name with the suffix 'Cat'.
comb_string_vec <- c('For.which.of.these.tasks.',
                     'To.what.degree.do.you.',
                     'Think.of.re.using.data.',
                     'Which.neuroscience.discipline.s.',
                     'Think.of.data.sharing.')

comb_col_names <- c('TaskStandardTools',
                    'TaskStandardToolsDegree',
                    'ThinkReusingData',
                    'NeuroDiscipline',
                    'ThinkSharingData')

library(data.table)
data0 <- neuro_data_tmp
for (i in seq_along(comb_string_vec)) {
  # columns whose name contains the current pattern are melted,
  # all remaining columns are kept as identifiers
  in_group <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(as.data.table(data0),
                            id = which(!in_group),
                            measure = list(which(in_group)),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i], value.factor = TRUE)
  data0 <- as.data.frame(data0)

  # The second-to-last column is the category factor just created; its
  # levels are the original column names. Turn them into readable labels:
  # cut everything before the end of the first '...', then replace the
  # remaining dots by blanks and restore the abbreviation 'e.g.'.
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])
  cut_from <- str_locate(level_strings, '\\.\\.\\.')[, 2]
  level_strings <- substr(level_strings, cut_from, nchar(level_strings))
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

# Public data sharing by current position.
# `data` is in long format, so the three columns are reduced to their
# distinct combinations (one row per respondent).
temp <- data %>%
  select(Response.ID, Shared_Publicly, CurrentPosition) %>%
  unique()

# Number of respondents per sharing answer and position. Cells with fewer
# than three respondents and rows with missing values are dropped.
temp_relFreq <- temp %>%
  group_by(Shared_Publicly, CurrentPosition) %>%
  summarise(n = n()) %>%
  filter(n >= 3) %>%
  na.omit()

library(reshape2)

# Plot idea: set every position to 100 % and show the share of respondents
# who shared data publicly.
#
# Wide table with one row per position and one count column per sharing
# answer. reshape2::dcast is called by its full name because data.table,
# attached later in this script, masks dcast with a version meant for
# data.tables. value.var is given explicitly instead of being guessed.
# A position that lost its "Yes" or "No" cell in the n >= 3 filter gets an
# NA there and is removed by na.omit().
temp_absNumbers <- reshape2::dcast(as.data.frame(temp_relFreq),
                                   CurrentPosition ~ Shared_Publicly,
                                   value.var = "n") %>%
  mutate(percent = round(Yes / (No + Yes) * 100, 0)) %>%
  na.omit() %>%
  arrange(desc(percent)) %>%
  mutate(CurrentPosition = str_replace(CurrentPosition, "Research data management focused staff", "RDM staff"))



# Horizontal bar chart of the percentage of respondents per current position
# who shared data publicly; bars are ordered by that percentage and labelled
# with the value. geom_col() draws the pre-computed values directly.
pFD = ggplot(data=temp_absNumbers) + 
  geom_col(mapping=aes(x=reorder(CurrentPosition, percent),y= percent),
           colour = 'darkblue', fill='darkblue',
           width = 0.5) +
  coord_flip() +
  theme(text = element_text(size=11)) +
  xlab('') + ylab('') + ggtitle(paste0('Datasharing for different scientific positions')) +
  geom_text(aes(x=CurrentPosition,y = percent, label = paste0(percent,"%")), colour = "white",hjust=1.5) + 
  # wrap long axis labels onto several lines
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, " " , " "), width = 20))  

# Show the figure on the current device. print() is explicit because
# top-level auto-printing does not happen when the script is run via source().
print(pFD)

# Write the figure to a TIFF file; without print() the file would stay empty
# when the script is sourced.
ragg::agg_tiff("Fig5_Position_vs_Sharing.tiff", width = 17.5, height = 6, units = "cm", res = 600, scaling = 1)
print(pFD)
dev.off()




# One row per respondent and neuroscience discipline they answered "Yes" to,
# together with their public-sharing answer. Incomplete and repeated rows are
# removed.
temp <- data %>%
  filter(NeuroDiscipline == "Yes") %>%
  select(Response.ID, Shared_Publicly, NeuroDisciplineCat) %>%
  na.omit() %>%
  unique()

# Count respondents per (Shared_Publicly, NeuroDisciplineCat) combination.
# Combinations with fewer than 3 respondents and rows containing NA are dropped.
temp_relFreq <- temp %>%
  group_by(Shared_Publicly, NeuroDisciplineCat) %>%
  summarise(n = n()) %>%
  filter(n >= 3) %>%
  na.omit()

library(reshape2)


# Idea for a two-colour plot (work in progress): scale every category to
# 100 % and show the share of respondents who share data as a percentage.

# Spread the counts into one row per discipline with a "No" and a "Yes"
# column, express "Yes" as a rounded percentage of both, drop incomplete rows
# and sort from highest to lowest percentage. Afterwards the discipline labels
# are shortened by applying the (pattern, replacement) pairs below in order;
# each pair replaces the first match only.
temp_absNumbers <- temp_relFreq %>%
  dcast(NeuroDisciplineCat ~ Shared_Publicly) %>%
  mutate(percent = round(Yes/ (No + Yes)*100,0)) %>%
  na.omit() %>%
  arrange(desc(percent)) %>%
  mutate(NeuroDisciplineCat = Reduce(
    function(labels, edit) str_replace(labels, edit[1], edit[2]),
    list(
      c("imaging", "Imaging"),
      c("Theoretical", "Theoret."),
      # applied twice, as before: would shorten a second occurrence
      c("Theoretical", "Theoret."),
      c("behavioral", "Behav."),
      c("neuroscience", "Neurosci."),
      c("Neuroscience", "Neurosci."),
      c("e.g. electrophysiological recording behavior tracking ", ""),
      c("e.g.  patient involvement clinical trials ", ""),
      c("science", "Science"),
      c("e.g.  modeling simulation ", "")
    ),
    NeuroDisciplineCat
  ))





# Horizontal bar chart of the percentage of respondents per neuroscience
# discipline who shared data publicly; bars are ordered by that percentage and
# labelled with the value. geom_col() draws the pre-computed values directly.
pFD = ggplot(data=temp_absNumbers) + 
  geom_col(mapping=aes(x=reorder(NeuroDisciplineCat, percent),y= percent),
           colour = 'darkblue', fill='darkblue',
           width = 0.5) +
  coord_flip() +
  theme(text = element_text(size=11)) +
  xlab('') + ylab('') + ggtitle(paste0('Datasharing for different neuroscientific subdisciplines')) +
  geom_text(aes(x=NeuroDisciplineCat,y = percent, label = paste0(percent,"%")), colour = "white",hjust=1.5) + 
  # wrap long axis labels onto several lines
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, " " , " "), width = 40))  

# Show the figure on the current device. print() is explicit because
# top-level auto-printing does not happen when the script is run via source().
print(pFD)

# Write the figure to a TIFF file; without print() the file would stay empty
# when the script is sourced.
ragg::agg_tiff("Fig6_Discipline_vs_Sharing.tiff", width = 17.5, height = 10, units = "cm", res = 600, scaling = 1)
print(pFD)
dev.off()





























# NOTE(review): temp_relFreq is not recomputed in this section, so the
# percentages below come from whichever count table was assigned to it last,
# and index [2] simply picks the second row of that table. The printed text
# talks about standard tools -- confirm that this is the intended input.

# Share (in %) of each row's count among the respondents who did not /
# did share data publicly.
no = temp_relFreq %>% 
  filter(Shared_Publicly=="No" ) %>% 
  mutate(procent = n/sum(n)*100)
yes = temp_relFreq %>% 
  filter(Shared_Publicly=="Yes" ) %>% 
  mutate(procent = n/sum(n)*100)

# Print the paragraph in a single cat() call with explicit separators; the
# former consecutive cat() calls ran the sentences together.
cat("Although the use of standard tools are in very different areas there is ",
    "the trend that those who generally use more standard tools are more likely to share their data. ",
    "In the group that did not share their data publicly only ",
    no$procent[2], " % use standard Tools. While in the group who shares data ",
    yes$procent[2], " % use standard Tools. ",
    "A possible explanation could be that scientists who work a lot with standard tools find it easier to overcome the heavily standardized rules of public sharing of data. ",
    "Formally, of course, it cannot be excluded that the dominant causality is opposite. However, we consider it unlikely that the motivation to share data is the main driver for a general affinity to use standard methods. ",
    "\n", sep = "")






###
# Use of standard Tools Degree

# Long table of the "degree of standard tool use" answers with the
# public-sharing answer; categories whose label contains "Comment" or "Other"
# are excluded, incomplete and repeated rows are removed.
# NOTE(review): the second condition compares the category label (not the
# answer column) with "No" -- confirm this is the intended column.
temp <- data %>%
  select(Response.ID, Shared_Publicly,
         TaskStandardToolsDegreeCat, TaskStandardToolsDegree) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsDegreeCat),
         TaskStandardToolsDegreeCat != "No") %>%
  na.omit() %>%
  unique() %>%
  droplevels()

# Counts per (sharing answer, task category, degree). `percent` is each
# degree's rounded share within its (sharing answer, task category) group.
# The ' Simulation ' category is left out; the result is grouped by sharing
# answer.
temp_absNumbers_all <- temp %>%
  group_by(Shared_Publicly, TaskStandardToolsDegreeCat, TaskStandardToolsDegree) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n /sum(n)*100,0)) %>%
  filter(TaskStandardToolsDegreeCat != ' Simulation ') %>%
  group_by(Shared_Publicly)

# Same table restricted to the answer 'Mostly'.
temp_absNumbers <- temp_absNumbers_all %>%
  filter(TaskStandardToolsDegree == 'Mostly')




# Share (in %) of 'Mostly' answers among all answers, separately for
# respondents who did / did not share data publicly. The denominator is
# indexed with a mask built from temp_absNumbers_all itself; a mask taken
# from the shorter 'Mostly' table would be recycled and select wrong rows.
yes = sum(temp_absNumbers$n[temp_absNumbers$Shared_Publicly=="Yes"]) /
  sum(temp_absNumbers_all$n[temp_absNumbers_all$Shared_Publicly=="Yes"])*100
no = sum(temp_absNumbers$n[temp_absNumbers$Shared_Publicly=="No"]) /
  sum(temp_absNumbers_all$n[temp_absNumbers_all$Shared_Publicly=="No"])*100

# Report both shares and the relative difference between them; each sentence
# ends with a separator so consecutive cat() outputs do not run together.
cat(yes, " % answer mostly if they share their data while only ")
cat(no, " % using mostly standard methods for their work if they did not share their data openly\n")
cat("Respondents who share their data publicly have a " , (yes-no)/no*100,  "% higher rate to using 'mostly' standard tools in their daily work\n")



######################################################################
######################################################################
# Association analysis for data sharing: which factors influence whether data is shared?


# Rebuild the working table: the respondent id, seven single-answer questions
# and four multi-column question groups selected by column-name prefix.
# `single_questions` maps the short name used in the rest of the script to the
# long survey column name.
single_questions <- c(
  Other_can_answer = "Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..",
  Existing_Data = "Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.",
  I_have_RDM_personal = "Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.",
  I_can_handle_RD_community_standards = "What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.",
  I_know_how_to_publish_my_data_reproducible = "Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.",
  how_much_time = "How.much.time.do.you.currently.need.to.ready.a.dataset.from.your.lab.for.publication.and.re.use.",
  Shared_Publicly = "Have.you.shared.data.with.....Publicly."
)

neuro_data_tmp <- neuro_data %>%
  dplyr::select(Response.ID,
                all_of(unname(single_questions)),
                starts_with('For.which.of.these.tasks.'),
                starts_with('To.what.degree.do.you.'),
                starts_with('Think.of.re.using.data.'),
                starts_with('Think.of.data.sharing.'))

# Replace the long survey column names with their short names, in place.
colnames(neuro_data_tmp)[match(single_questions, colnames(neuro_data_tmp))] <-
  names(single_questions)




# Column-name prefixes of the multi-column question groups and the name each
# group gets once it is melted into long format.
comb_string_vec = c('For.which.of.these.tasks.',
                    'To.what.degree.do.you.',
                    'Think.of.re.using.data.',
                    'Think.of.data.sharing.')

comb_col_names = c('TaskStandardTools',
                   'TaskStandardToolsDegree',
                   'ThinkReusingData',
                   'ThinkSharingData')

library(data.table)
data0 = neuro_data_tmp
# Melt one question group per iteration: all columns matching the prefix are
# stacked into a "<name>Cat" factor (the former column name) and a "<name>"
# value column; every other column is kept as an id column.
# seq_along() keeps the loop from failing on an empty prefix vector.
for(i in seq_along(comb_string_vec)){
  in_group = grepl(comb_string_vec[i],colnames(data0))
  data0 = data.table::melt(as.data.table(data0),
                           id.vars = which(!in_group),
                           measure.vars = list(which(in_group)),
                           variable.name = paste0(comb_col_names[i],'Cat'),
                           value.name = comb_col_names[i],value.factor=TRUE)
  
  # make some nicer labels for the category column (second to last column)
  data0 = as.data.frame(data0)
  cat_col = ncol(data0)-1
  level_strings = levels(data0[,cat_col])
  
  # Vectorised relabelling: keep the text from the last dot of the first
  # "..." onwards, turn dots into spaces and restore "e.g.".
  # NOTE(review): the alternation in the dot pattern is kept exactly as it
  # was; with R's default regex engine it appears to replace a pair of dots
  # by a single space -- do not simplify without checking the labels.
  cut_at = str_locate(level_strings, '\\.\\.\\.')[, "end"]
  level_strings = substr(level_strings,cut_at,nchar(level_strings))
  level_strings = gsub('\\.|\\.\\.',' ',level_strings)
  level_strings = gsub('e g','e.g.',level_strings)
  
  # reset the labels
  levels(data0[,cat_col]) = level_strings
}
data = data0



# Answer groupings for the agreement questions.
# NOTE(review): `not_agree` holds the agreeing answers plus 'Undecided' and
# `not_disagree` the disagreeing ones -- the names look swapped; confirm
# against where they are used.
agree <- c("Fully agree", "Rather agree")
disagree <- c("Fully disagree", "Rather disagree")
not_agree <- c("Fully agree", "Rather agree", "Undecided")
not_disagree <- c("Fully disagree", "Rather disagree", "Undecided")

# Number of respondents per answer to the "how much time" question and the
# rounded percentage of each answer, sorted by ascending percentage.
# Missing answers are dropped and repeated rows are counted once.
temp_absNumbers <- data %>%
  select(Response.ID, how_much_time) %>%
  na.omit() %>%
  unique() %>%
  group_by(how_much_time) %>%
  summarise(n = n()) %>%
  mutate(percent = round(n / sum(n)*100,0)) %>%
  arrange(percent)



# Horizontal bar chart of the percentage of respondents per answer to the
# "how much time" question; bars are ordered by that percentage and labelled
# with the value. geom_col() draws the pre-computed values directly.
pFD = ggplot(data=temp_absNumbers) + 
  geom_col(mapping=aes(x=reorder(how_much_time, percent),y= percent),
           colour = 'darkblue', fill='darkblue',
           width = 0.5) +
  coord_flip() +
  theme(text = element_text(size=11)) +
  xlab('') + ylab('') + ggtitle(paste0('Time needed to ready a dataset for publication and reuse')) +
  geom_text(aes(x=how_much_time,y = percent, label = paste0(percent,"%")), colour = "white",hjust=1.5) + 
  # wrap long axis labels onto several lines
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, " " , " "), width = 20))  

# Show the figure on the current device. print() is explicit because
# top-level auto-printing does not happen when the script is run via source().
print(pFD)

# Write the figure to a TIFF file; without print() the file would stay empty
# when the script is sourced.
ragg::agg_tiff("Fig4_Time.tiff", width = 17.5, height = 7, units = "cm", res = 600, scaling = 1)
print(pFD)
dev.off()


######################################################################
######################################################################
######################################################################
######################################################################
######################################################################
######################################################################

# What is the factor that most strongly separates sharers from non-sharers?
# Try something



# Rebuild the working table: the respondent id, ten single-answer questions
# and six multi-column question groups selected by column-name prefix.
# `single_questions` maps the short name used in the rest of the script to the
# long survey column name.
single_questions <- c(
  Other_can_answer = "Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..",
  Existing_Data = "Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse.",
  I_have_RDM_personal = "Do.you.have.dedicated.personnel.with.research.data.management.or.data.curation.expertise.",
  I_can_handle_RD_community_standards = "What.is.your.opinion.on.the.following.statements...I.can.handle.my.research.data.according.to.community.standards.",
  I_know_how_to_publish_my_data_reproducible = "Please.indicate...I.know.how.to.publish.my.data.analysis.workflows.in.a.reproducible.manner.",
  I_have_proficiency_in_RDM = "What.is.your.opinion.on.the.following.statements...I.have.proficiency.in.research.data.management.",
  Iam_highly_knowledgeable_in_RDM = "What.is.your.opinion.on.the.following.statements...Overall..I.am.highly.knowledgeable.about.research.data.management.in.my.research.field.",
  I_know_RDM_available_Methods = "What.is.your.opinion.on.the.following.statements...I.know.well.which.research.data.management.methods.are.available.",
  Shared_Publicly = "Have.you.shared.data.with.....Publicly.",
  CurrentPosition = "My.current..primary..position.is."
)

neuro_data_tmp <- neuro_data %>%
  dplyr::select(Response.ID,
                all_of(unname(single_questions)),
                starts_with('For.which.of.these.tasks.'),
                starts_with('To.what.degree.do.you.'),
                starts_with('Think.of.re.using.data.'),
                starts_with('Which.neuroscience.discipline.s.'),
                starts_with('Applying.research.data.management..'),
                starts_with('Think.of.data.sharing.'))

# Replace the long survey column names with their short names, in place.
colnames(neuro_data_tmp)[match(single_questions, colnames(neuro_data_tmp))] <-
  names(single_questions)




# Column-name prefixes of the multi-column question groups and the name each
# group gets once it is melted into long format.
comb_string_vec = c('For.which.of.these.tasks.',
                    'To.what.degree.do.you.',
                    'Think.of.re.using.data.',
                    'Which.neuroscience.discipline.s.',
                    'Think.of.data.sharing.')

comb_col_names = c('TaskStandardTools',
                   'TaskStandardToolsDegree',
                   'ThinkReusingData',
                   'NeuroDiscipline',
                   'ThinkSharingData')

library(data.table)
data0 = neuro_data_tmp
# Melt one question group per iteration: all columns matching the prefix are
# stacked into a "<name>Cat" factor (the former column name) and a "<name>"
# value column; every other column is kept as an id column.
# seq_along() keeps the loop from failing on an empty prefix vector.
for(i in seq_along(comb_string_vec)){
  in_group = grepl(comb_string_vec[i],colnames(data0))
  data0 = data.table::melt(as.data.table(data0),
                           id.vars = which(!in_group),
                           measure.vars = list(which(in_group)),
                           variable.name = paste0(comb_col_names[i],'Cat'),
                           value.name = comb_col_names[i],value.factor=TRUE)
  
  # make some nicer labels for the category column (second to last column)
  data0 = as.data.frame(data0)
  cat_col = ncol(data0)-1
  level_strings = levels(data0[,cat_col])
  
  # Vectorised relabelling: keep the text from the last dot of the first
  # "..." onwards, turn dots into spaces and restore "e.g.".
  # NOTE(review): the alternation in the dot pattern is kept exactly as it
  # was; with R's default regex engine it appears to replace a pair of dots
  # by a single space -- do not simplify without checking the labels.
  cut_at = str_locate(level_strings, '\\.\\.\\.')[, "end"]
  level_strings = substr(level_strings,cut_at,nchar(level_strings))
  level_strings = gsub('\\.|\\.\\.',' ',level_strings)
  level_strings = gsub('e g','e.g.',level_strings)
  
  # reset the labels
  levels(data0[,cat_col]) = level_strings
}
data = data0

#
# I_can_handle_RD_community_standards
# I_have_proficiency_in_RDM
# Iam_highly_knowledgeable_in_RDM
# I_know_RDM_available_Methods


# One row per respondent with the sharing answer, the current position and the
# self-assessment questions; repeated rows of the long table are collapsed.
temp <- unique(select(data,
                      Response.ID,
                      Shared_Publicly,
                      CurrentPosition,
                      I_can_handle_RD_community_standards,
                      I_have_proficiency_in_RDM,
                      Iam_highly_knowledgeable_in_RDM,
                      I_know_RDM_available_Methods,
                      I_have_RDM_personal))

# Collapse the four-step agreement scale to "Agree" / "Disagree". Each
# (pattern, replacement) pair is applied to every column of the table in turn,
# rebuilding the data frame after each pass.
df <- Reduce(
  function(tbl, edit) {
    data.frame(lapply(tbl, function(x) { gsub(edit[1], edit[2], x)}))
  },
  list(
    c("Fully agree", "Agree"),
    c("Rather agree", "Agree"),
    c("Rather disagree", "Disagree"),
    c("Fully disagree", "Disagree")
  ),
  temp
)


# For every answer column (column 3 onwards) count the respondents per
# (Shared_Publicly, answer) combination, print the table and the ratio of the
# "Agree" to the "Disagree" count among respondents who shared publicly.
# The resulting tables were inspected by hand; the ratio between Yes/Agree and
# Yes/Disagree was computed from them and reported in the paper.
for(s in seq(3,length(df),1)){
  print(colnames(df)[s])
  df_tmp = df[,c(1,2,s)]

  temp_relFreq = df_tmp %>% 
    group_by_at(vars(-Response.ID, Shared_Publicly)) %>% 
    summarise(n = n()) %>% 
    filter(n >= 3) %>%
    na.omit()  %>%
    # NOTE(review): the factor 50 (rather than 100) is kept as found --
    # confirm that it is intended.
    mutate(percent = round(n/sum(n)*50,0))
  
  print(temp_relFreq)
  
  # Second column of the count table holds the answers of column s.
  answer = temp_relFreq[[2]]
  shared = temp_relFreq$Shared_Publicly=="Yes"
  # Trailing newline so the output does not run into the next print().
  # Nothing is printed after "=" when one of the two counts is absent.
  cat("Agree and Shared = ",
      temp_relFreq$n[shared & answer=="Agree"] / temp_relFreq$n[shared & answer=="Disagree"],
      "\n")
}




######################################################################
######################################################################
######################################################################
######################################################################
######################################################################
######################################################################
######################################################################
######################################################################
######################################################################









# Rebuild the working table from the respondent id and five multi-column
# question groups selected by column-name prefix.
data0 = neuro_data %>% dplyr::select(Response.ID,
                                     starts_with('Have.you.shared.data.with'),
                                     starts_with('For.which.of.these.tasks.'),
                                     starts_with('To.what.degree.do.you.'),
                                     starts_with('Think.of.re.using.data.'),
                                     starts_with('Think.of.data.sharing.')
)

# Column-name prefixes of the question groups and the name each group gets
# once it is melted into long format.
comb_string_vec = c('For.which.of.these.tasks.',
                    'To.what.degree.do.you.',
                    'Think.of.re.using.data.',
                    'Think.of.data.sharing.',
                    'Have.you.shared.data.with')
comb_col_names = c('TaskStandardTools',
                   'TaskStandardToolsDegree',
                   'ThinkReusingData',
                   'ThinkSharingData',
                   'DataSharing')

library(data.table)
# Melt one question group per iteration: all columns matching the prefix are
# stacked into a "<name>Cat" factor (the former column name) and a "<name>"
# value column; every other column is kept as an id column.
# seq_along() keeps the loop from failing on an empty prefix vector.
for(i in seq_along(comb_string_vec)){
  in_group = grepl(comb_string_vec[i],colnames(data0))
  data0 = data.table::melt(as.data.table(data0),
                           id.vars = which(!in_group),
                           measure.vars = list(which(in_group)),
                           variable.name = paste0(comb_col_names[i],'Cat'),
                           value.name = comb_col_names[i],value.factor=TRUE)
  
  # make some nicer labels for the category column (second to last column)
  data0 = as.data.frame(data0)
  cat_col = ncol(data0)-1
  level_strings = levels(data0[,cat_col])
  
  # Vectorised relabelling: keep the text from the last dot of the first
  # "..." onwards, turn dots into spaces and restore "e.g.".
  # NOTE(review): the alternation in the dot pattern is kept exactly as it
  # was; with R's default regex engine it appears to replace a pair of dots
  # by a single space -- do not simplify without checking the labels.
  cut_at = str_locate(level_strings, '\\.\\.\\.')[, "end"]
  level_strings = substr(level_strings,cut_at,nchar(level_strings))
  level_strings = gsub('\\.|\\.\\.',' ',level_strings)
  level_strings = gsub('e g','e.g.',level_strings)
  
  # reset the labels
  levels(data0[,cat_col]) = level_strings
}
data = data0

# Input for the standard-tools plots: task category and answer next to the
# data-sharing category and answer for every respondent. Task categories whose
# label contains "Comment" or "Other" are excluded; incomplete and repeated
# rows are removed and unused factor levels dropped.
temp <- data %>%
  select(Response.ID,
         TaskStandardToolsCat, TaskStandardTools,
         DataSharingCat, DataSharing) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsCat)) %>%
  na.omit() %>%
  unique() %>%
  droplevels()

# Respondent counts for every combination of the four answer columns.
# `share` is each DataSharing answer's fraction within its
# (task category, task answer, sharing category) group.
temp_relFreq <- temp %>%
  group_by(TaskStandardToolsCat, TaskStandardTools, DataSharingCat, DataSharing) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))



### Data sharing
# Earlier variant, kept for reference:
#temp = data %>% select(Response.ID,DataSharingCat,DataSharing) %>% filter(DataSharing != 'No') %>%
#  na.omit() %>% unique() %>% group_by(DataSharing) %>% filter(n() >= 3)

# Counts per task category and answer with the within-category fraction.
# This result is replaced by the next assignment.
temp_absNumbers <- temp %>%
  group_by(TaskStandardToolsCat, TaskStandardTools) %>%
  summarise(n = n()) %>%
  mutate(percent = n /sum(n))

# Counts for every combination of the four answer columns.
temp_absNumbers <- temp %>%
  group_by(TaskStandardToolsCat, TaskStandardTools, DataSharingCat, DataSharing) %>%
  summarise(n = n())

# Counts without the DataSharing answer, scaled by the fixed value 144.
# NOTE(review): this replaces temp_relFreq with a table that has a `percent`
# column but neither `share` nor `DataSharing` -- check later users of
# temp_relFreq; the meaning of 144 is not visible here.
temp_relFreq <- temp %>%
  group_by(TaskStandardToolsCat, TaskStandardTools, DataSharingCat) %>%
  summarise(n = n()) %>%
  mutate(percent = n / 144)

# Horizontal bar chart of the absolute counts per data-sharing question.
# NOTE(review): temp_absNumbers holds one row per combination of all answer
# columns, so several rows share one DataSharingCat; the bars are stacked while
# the text labels are placed at each row's own n.
pFD <- ggplot(data = temp_absNumbers) +
  # geom_col() draws the pre-computed counts as they are (no binning)
  geom_col(mapping = aes(x = DataSharingCat, y = n),
           colour = 'darkblue', fill = 'darkblue',
           width = 0.5) +
  coord_flip() +
  xlab('') + ylab('') +
  ggtitle(paste0('Datasharing (n = ', sum(temp_absNumbers$n), ')')) +
  geom_text(aes(x = DataSharingCat, y = n, label = n), colour = "white", hjust = 1.5) +
  # wrap long axis labels after at most 30 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,30})(\\s|$)', '\\1\n', x)
  })

# print() explicitly: a bare `pFD` draws nothing when the script is run through
# source(), which would leave the TIFF file empty.
print(pFD)
tiff('Fig2_DataSharing.tiff', width = 17.5, height = 7, units = "cm", res = 600)
print(pFD)
dev.off()




# Task-standard-tools answers (facet rows) by data-sharing question (facet
# columns), coloured by the data-sharing answer.
# Reads TaskStandardToolsCat, TaskStandardTools, DataSharingCat, DataSharing
# and share from temp_relFreq.
# NOTE(review): share is a fraction (n / sum(n)) although the axis says percent.
pTST <- ggplot(data = temp_relFreq) +
  # geom_col() draws the pre-computed shares as they are (no binning)
  geom_col(mapping = aes(x = TaskStandardToolsCat, y = share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(TaskStandardTools ~ DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # wrap long axis labels after at most 50 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,50})(\\s|$)', '\\1\n', x)
  })

# print() explicitly so the file is also written when the script is source()d
png('For.which.of.these.tasks.do.you.use.available.tools.or.standards.png', width = 30, height = 20, units = "cm", res = 300)
print(pTST)
dev.off()

# make a Task Standard Tools Degree plot
# One row per respondent x degree question x data-sharing question; the
# 'Other' items are dropped.
temp <- data %>%
  select(Response.ID, TaskStandardToolsDegreeCat, TaskStandardToolsDegree, DataSharingCat, DataSharing) %>%
  filter(!grepl('Other', TaskStandardToolsDegreeCat)) %>%
  na.omit() %>%
  unique()

# combine some levels: the two frequent-use answers become 'Often', the
# occasional / not-relevant answers become 'Rare' (other answers stay as is)
temp$TaskStandardToolsDegree <- as.character(temp$TaskStandardToolsDegree)
temp$TaskStandardToolsDegree[temp$TaskStandardToolsDegree %in%
                               c('As much as possible', 'Mostly')] <- 'Often'
temp$TaskStandardToolsDegree[temp$TaskStandardToolsDegree %in%
                               c('Occasionally', 'This is not relevant for my scientific work')] <- 'Rare'
temp$TaskStandardToolsDegree <- as.factor(temp$TaskStandardToolsDegree)

# relative frequency of each data-sharing answer within every remaining cell
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))

# Degree of tool use (facet rows) by data-sharing question (facet columns).
pTSD <- ggplot(data = temp_relFreq) +
  # geom_col() draws the pre-computed shares as they are (no binning)
  geom_col(mapping = aes(x = TaskStandardToolsDegreeCat, y = share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(TaskStandardToolsDegree ~ DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # wrap long axis labels after at most 50 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,50})(\\s|$)', '\\1\n', x)
  })

# print() explicitly so the file is also written when the script is source()d
png('To.what.degree.do.you.use.available.tools.or.standards.png', width = 30, height = 20, units = "cm", res = 300)
print(pTSD)
dev.off()

# make a Think of Reusing Data plot
# One row per respondent x re-use statement x data-sharing question.
temp <- data %>%
  select(Response.ID, ThinkReusingDataCat, ThinkReusingData, DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()

# relative frequency of each data-sharing answer within every remaining cell
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))

# Re-use answers (facet rows) by data-sharing question (facet columns).
pTRD <- ggplot(data = temp_relFreq) +
  # geom_col() draws the pre-computed shares as they are (no binning)
  geom_col(mapping = aes(x = ThinkReusingDataCat, y = share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(ThinkReusingData ~ DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # wrap long axis labels after at most 50 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,50})(\\s|$)', '\\1\n', x)
  })

# print() explicitly so the file is also written when the script is source()d
png('Think.of.re.using.data.from.repositories.png', width = 30, height = 20, units = "cm", res = 300)
print(pTRD)
dev.off()

# make a Think of Sharing Data plot
# One row per respondent x sharing statement x data-sharing question.
temp <- data %>%
  select(Response.ID, ThinkSharingDataCat, ThinkSharingData, DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()

# relative frequency of each data-sharing answer within every remaining cell
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))

# Sharing-attitude answers (facet rows) by data-sharing question (facet columns).
# Note: this reuses the name pTSD of the degree plot built earlier.
pTSD <- ggplot(data = temp_relFreq) +
  # geom_col() draws the pre-computed shares as they are (no binning)
  geom_col(mapping = aes(x = ThinkSharingDataCat, y = share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(ThinkSharingData ~ DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # wrap long axis labels after at most 50 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,50})(\\s|$)', '\\1\n', x)
  })

# print() explicitly so the file is also written when the script is source()d
png('Think.of.sharing.with.researchers.who.are.NOT.direct.collaborators.png', width = 30, height = 20, units = "cm", res = 300)
print(pTSD)
dev.off()




####################################################################















# make a Familiar Data Types plot
# Familiar Data Types are Yes/No questions ==> only the answers other than 'No'
# are kept, and only answer values that occur at least 3 times.
# NOTE(review): `data` as assembled directly above does not appear to contain
# the FimilarDataTypes columns; this chunk presumably needs the data set that
# melts 'Please.state.if.your.' - confirm before running.
temp <- data %>%
  select(Response.ID, FimilarDataTypesCat, FimilarDataTypes) %>%
  filter(FimilarDataTypes != 'No') %>%
  na.omit() %>%
  unique() %>%
  group_by(FimilarDataTypes) %>%
  filter(n() >= 3)

# absolute number of respondents per data type and answer
temp_absNumbers <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n())

pFD <- ggplot(data = temp_absNumbers) +
  # geom_col() draws the pre-computed counts as they are (no binning)
  geom_col(mapping = aes(x = FimilarDataTypesCat, y = n),
           colour = 'darkblue', fill = 'darkblue',
           width = 0.5) +
  coord_flip() +
  xlab('') + ylab('') +
  ggtitle(paste0('Familiar Datatypes \n(n = ', sum(temp_absNumbers$n), ', multiple answers possible)')) +
  geom_text(aes(x = FimilarDataTypesCat, y = n, label = n), colour = "white", hjust = 1.5) +
  # wrap long axis labels after at most 30 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,30})(\\s|$)', '\\1\n', x)
  })

#png('Please.state.if.your.work.includes.png', width = 30, height = 20, units = "cm", res = 300)
# print() explicitly: a bare `pFD` draws nothing when the script is source()d
print(pFD)
#dev.off()

###
# Data availability
# NOTE(review): despite the heading this chunk repeats the familiar-data-types
# summary and plot from the chunk above, and the plot object is never displayed
# or saved here - presumably a placeholder for a data-availability figure.
temp <- data %>%
  select(Response.ID, FimilarDataTypesCat, FimilarDataTypes) %>%
  filter(FimilarDataTypes != 'No') %>%
  na.omit() %>%
  unique() %>%
  group_by(FimilarDataTypes) %>%
  filter(n() >= 3)

# absolute number of respondents per data type and answer
temp_absNumbers <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n())

pFD <- ggplot(data = temp_absNumbers) +
  # geom_col() draws the pre-computed counts as they are (no binning)
  geom_col(mapping = aes(x = FimilarDataTypesCat, y = n),
           colour = 'darkblue', fill = 'darkblue',
           width = 0.5) +
  coord_flip() +
  xlab('') + ylab('') +
  ggtitle(paste0('Familiar Datatypes \n(n = ', sum(temp_absNumbers$n), ', multiple answers possible)')) +
  geom_text(aes(x = FimilarDataTypesCat, y = n, label = n), colour = "white", hjust = 1.5) +
  # wrap long axis labels after at most 30 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,30})(\\s|$)', '\\1\n', x)
  })









# Wide data set with the data-sharing, tool-usage and attitude questions.
data0 <- neuro_data %>% dplyr::select(Response.ID,
                                      starts_with('Have.you.shared.data.with'),
                                      starts_with('For.which.of.these.tasks.'),
                                      starts_with('To.what.degree.do.you.'),
                                      starts_with('Think.of.re.using.data.'),
                                      starts_with('Think.of.data.sharing.')
)

# Column-name patterns of the question groups and, in the same order, the names
# of the long-format value columns created for them.
comb_string_vec <- c('For.which.of.these.tasks.',
                     'To.what.degree.do.you.',
                     'Think.of.re.using.data.',
                     'Think.of.data.sharing.',
                     'Have.you.shared.data.with')
comb_col_names <- c('TaskStandardTools',
                    'TaskStandardToolsDegree',
                    'ThinkReusingData',
                    'ThinkSharingData',
                    'DataSharing')

library(data.table)
# Reshape data0 to long format, one question group per iteration: all columns
# whose name matches comb_string_vec[i] are stacked into a '<name>Cat' column
# (which question) and a '<name>' column (the answer).
for (i in seq_along(comb_string_vec)) {
  data0 <- data.table::melt(as.data.table(data0),
                            id.vars = c(which(!grepl(comb_string_vec[i], colnames(data0)))),
                            measure.vars = list(grep(comb_string_vec[i], colnames(data0))),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i],
                            value.factor = TRUE)

  # make some nicer labels: melt() appends the category column second to last
  data0 <- as.data.frame(data0)
  cat_col <- ncol(data0) - 1
  cat_labels <- levels(data0[[cat_col]])

  # keep only the part after the first '...' separator of the original column
  # name; names without a separator are kept whole instead of turning into NA
  cut_at <- str_locate(cat_labels, '\\.\\.\\.')[, 2]
  cut_at[is.na(cut_at)] <- 1L
  cat_labels <- substr(cat_labels, cut_at, nchar(cat_labels))
  # every dot becomes a blank ...
  cat_labels <- gsub('.', ' ', cat_labels, fixed = TRUE)
  # ... and a stand-alone 'e g' is turned back into 'e.g.' (word boundaries
  # keep text such as 'the great' untouched)
  cat_labels <- gsub('\\be g\\b', 'e.g.', cat_labels)

  # reset the labels
  levels(data0[[cat_col]]) <- cat_labels
}
data <- data0
# Standard tools
# make a Task Standard Tools plot
# Standard Task Tools are Yes/No questions ==> only the answers other than 'No'
# are kept; the free-text 'Comment' / 'Other' columns are removed and answer
# values that occur fewer than 3 times are dropped.
temp <- data %>%
  select(Response.ID, TaskStandardToolsCat, TaskStandardTools) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsCat)) %>%
  filter(TaskStandardTools != 'No') %>%
  na.omit() %>%
  unique() %>%
  group_by(TaskStandardTools) %>%
  filter(n() >= 3)

# absolute number of respondents per task and answer, largest first
# NOTE(review): the row order does not change the order of the bars, because
# the x axis follows the factor levels of TaskStandardToolsCat.
temp_absNumbers <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  arrange(desc(n))

pST <- ggplot(data = temp_absNumbers) +
  # geom_col() draws the pre-computed counts as they are (no binning)
  geom_col(mapping = aes(x = TaskStandardToolsCat, y = n),
           colour = 'darkblue', fill = 'darkblue',
           width = 0.5) +
  coord_flip() +
  xlab('') + ylab('') +
  ggtitle(paste0('Use of Standard Tools \n(n = ', sum(temp_absNumbers$n), ', multiple answers possible)')) +
  geom_text(aes(x = TaskStandardToolsCat, y = n, label = n), colour = "white", hjust = 1.5) +
  # wrap long axis labels after at most 30 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,30})(\\s|$)', '\\1\n', x)
  })

# print() explicitly: a bare `pST` draws nothing when the script is run through
# source(), which would leave the TIFF file empty.
print(pST)
tiff('UseOfStandardTools.tiff', width = 30, height = 20, units = "cm", res = 300)
print(pST)
dev.off()





######################################################################
######################################################################







# Among complete responses from people who have at least one data set:
# count and share of every answer to "others could re-use my data".
temp_absNumbers <- neuro_data_tmp %>%
  na.omit() %>%
  filter(Existing_Data != 'I have no datasets') %>%
  group_by_at(vars(Other_can_answer)) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

# Look the 'Yes' row up by value instead of by position, so the printed figure
# stays correct if the set or order of answer categories changes.
yes_row <- temp_absNumbers[temp_absNumbers$Other_can_answer == 'Yes', ]

cat("Von den Antwortenden die mindestens einen Datensatz haben ... ")
cat( yes_row$n, '(', round(yes_row$percent,3)*100, '%) , of all respondents that have at least one dataset are of the opinion that other researchers could answer their own research questions by re-using data from their research.') 

# Among complete responses that answered 'Yes' above: share of each
# public-sharing answer.
temp_absNumbers <- neuro_data_tmp %>%
  na.omit() %>%
  #filter(!Existing_Data == 'I have no datasets') %>%
  filter(Other_can_answer == 'Yes') %>%
  group_by_at(vars(Shared_Publicly)) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

S1 <- 'However, even for this subgroup, of scientists in possession of data of which they think are valuable to others '
S2 <- '% have never shared any of their data publicly.'
# NOTE(review): row 1 is presumably the "never shared" answer of
# Shared_Publicly - confirm, the lookup is positional.
cat(S1, round(temp_absNumbers$percent[1],3)*100, S2)



#############################################################




# Of the respondents who have data, how many think that these data are also useful to others?


#q1 = 'Think.of.data.sharing.with.researchers.who.are.NOT.direct.collaborators..Please.indicate...Other.researchers.could.answer.their.own.research.questions.by.re.using.data.from.my.research..'
#data3 = data[data$Existing_Data!=NA,]









# Factors promoting public data sharing
# To identify factors that promote public data sharing, the answers of participants
# were split by whether they have already shared their data in
# public repositories or not. We excluded all subjects who did not have any data.

# 1. Remove the subjects who do not have any data

#vec <- c("I have no datasets")
#data1 <- data0[data0$Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse. %in% vec,]

# Keep the respondents whose answer to the existing-data-sets question is
# anything other than "I have no datasets" (rows with a missing answer are
# dropped by filter() as well).
data1 <- neuro_data %>%
  filter(
    Do.you.have.existing.data.sets..experiments..that.should.be.kept.alive.by.making.them.available.for.reuse. != "I have no datasets"
  )


# quick look at the column names: storage type and number of columns
x <- names(neuro_data)
typeof(x)
length(x)


# Factors promoting public data sharing, restricted to respondents who have
# data sets (data1): public-sharing answer, current position, re-use attitudes
# and the time-needed items.
# (A first data0 / comb_* definition built from neuro_data used to precede this
# one; it was overwritten before being used and has been removed.)
data0 <- data1 %>% dplyr::select(Response.ID,
                                 starts_with('Have.you.shared.data.with.....Publicly.'),
                                 starts_with('My.current..primary..position.is'),
                                 starts_with('Think.of.re.using.data.'),
                                 starts_with('How.much.time.do.you.currently.need')
)

# Question groups to reshape and the names of their long-format value columns.
# The time-needed and data-sharing groups are melted too, because the summary
# below selects TimeNeededCat / TimeNeeded / DataSharingCat / DataSharing,
# which did not exist when only the re-use group was melted.
comb_string_vec <- c('Think.of.re.using.data.',
                     'How.much.time.do.you.currently.need',
                     'Have.you.shared.data.with')

comb_col_names <- c('ThinkREusingData',
                    'TimeNeeded',
                    'DataSharing')

library(data.table)
# Reshape data0 to long format, one question group per iteration: all columns
# whose name matches comb_string_vec[i] are stacked into a '<name>Cat' column
# (which question) and a '<name>' column (the answer).
for (i in seq_along(comb_string_vec)) {
  data0 <- data.table::melt(as.data.table(data0),
                            id.vars = c(which(!grepl(comb_string_vec[i], colnames(data0)))),
                            measure.vars = list(grep(comb_string_vec[i], colnames(data0))),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i],
                            value.factor = TRUE)

  # make some nicer labels: melt() appends the category column second to last
  data0 <- as.data.frame(data0)
  cat_col <- ncol(data0) - 1
  cat_labels <- levels(data0[[cat_col]])

  # keep only the part after the first '...' separator of the original column
  # name; names without a separator are kept whole instead of turning into NA
  cut_at <- str_locate(cat_labels, '\\.\\.\\.')[, 2]
  cut_at[is.na(cut_at)] <- 1L
  cat_labels <- substr(cat_labels, cut_at, nchar(cat_labels))
  # every dot becomes a blank ...
  cat_labels <- gsub('.', ' ', cat_labels, fixed = TRUE)
  # ... and a stand-alone 'e g' is turned back into 'e.g.' (word boundaries
  # keep text such as 'the great' untouched)
  cat_labels <- gsub('\\be g\\b', 'e.g.', cat_labels)

  # reset the labels
  levels(data0[[cat_col]]) <- cat_labels
}
data <- data0
# NOTE(review): datax is not used anywhere in this part of the script, and
# `melt` is called on a data.frame here - confirm it is still needed.
datax <-melt(data, id = c())

####################################
# Time needed for ready dataset
# One row per respondent x time-needed item x data-sharing question; the
# free-text 'Comment' / 'Other' items are removed.
temp <- data %>%
  select(Response.ID, TimeNeededCat, TimeNeeded, DataSharingCat, DataSharing) %>%
  filter(!grepl('Comment|Other', TimeNeededCat)) %>%
  #filter(TimeNeededCat != 'No') %>%
  na.omit() %>%
  unique() %>%
  droplevels()

# View() needs an interactive session; skip it in batch runs
if (interactive()) {
  View(temp)
}

# absolute counts for every combination of answers
temp_absNumbers <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n())

# Horizontal bar chart of the counts per time-needed item. The plot used to be
# a copy of the standard-tools figure (x = TaskStandardToolsCat, same title and
# file name), which cannot be drawn from this table and would have overwritten
# 'UseOfStandardTools.tiff'.
# NOTE(review): title, object name and file name are new - adjust as needed.
# NOTE(review): several rows share one TimeNeededCat, so the bars are stacked
# while the text labels sit at each row's own n.
pTN <- ggplot(data = temp_absNumbers) +
  # geom_col() draws the pre-computed counts as they are (no binning)
  geom_col(mapping = aes(x = TimeNeededCat, y = n),
           colour = 'darkblue', fill = 'darkblue',
           width = 0.5) +
  coord_flip() +
  xlab('') + ylab('') +
  ggtitle(paste0('Time needed \n(n = ', sum(temp_absNumbers$n), ')')) +
  geom_text(aes(x = TimeNeededCat, y = n, label = n), colour = "white", hjust = 1.5) +
  # wrap long axis labels after at most 30 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,30})(\\s|$)', '\\1\n', x)
  })

# print() explicitly: a bare plot object draws nothing when the script is run
# through source(), which would leave the TIFF file empty.
print(pTN)

tiff('TimeNeeded.tiff', width = 17.5, height = 7, units = "cm", res = 600)
print(pTN)
dev.off()

################old##########################
#############################################
# more elegant
# Wide data set with the data-sharing, existing-data and demographic questions.
data0 <- neuro_data %>% dplyr::select(Response.ID,
                                      starts_with('Have.you.shared.data.with'),
                                      starts_with('Do.you.have.existing.data'),
                                      starts_with('I.work.at'),
                                      starts_with('My.current..'),
                                      starts_with('Which.neuroscience.discipline.s.'),
                                      starts_with('Please.state.if.your.')
)
# Column-name patterns of the question groups and, in the same order, the names
# of the long-format value columns created for them.
comb_string_vec <- c('I.work.at',
                     'My.current..',
                     'Which.neuroscience.discipline.s.',
                     'Please.state.if.your.',
                     'Have.you.shared.data.with',
                     'Do.you.have.existing.data')
comb_col_names <- c('WorkPlaces',
                    'CurrentPosition',
                    'NeuroDiscipline',
                    'FimilarDataTypes',
                    'DataSharing',
                    'ExistingData')

# Use this loop with care: the resulting long-format data set can become very
# large, because every melted question group multiplies the number of rows.
library(data.table)
# Reshape data0 to long format, one question group per iteration: all columns
# whose name matches comb_string_vec[i] are stacked into a '<name>Cat' column
# (which question) and a '<name>' column (the answer).
for (i in seq_along(comb_string_vec)) {
  data0 <- data.table::melt(as.data.table(data0),
                            id.vars = c(which(!grepl(comb_string_vec[i], colnames(data0)))),
                            measure.vars = list(grep(comb_string_vec[i], colnames(data0))),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i],
                            value.factor = TRUE)

  # make some nicer labels: melt() appends the category column second to last
  data0 <- as.data.frame(data0)
  cat_col <- ncol(data0) - 1
  cat_labels <- levels(data0[[cat_col]])

  # keep only the part after the first '...' separator of the original column
  # name; names without a separator are kept whole instead of turning into NA
  cut_at <- str_locate(cat_labels, '\\.\\.\\.')[, 2]
  cut_at[is.na(cut_at)] <- 1L
  cat_labels <- substr(cat_labels, cut_at, nchar(cat_labels))
  # every dot becomes a blank ...
  cat_labels <- gsub('.', ' ', cat_labels, fixed = TRUE)
  # ... and a stand-alone 'e g' is turned back into 'e.g.' (word boundaries
  # keep text such as 'the great' untouched)
  cat_labels <- gsub('\\be g\\b', 'e.g.', cat_labels)

  # reset the labels
  levels(data0[[cat_col]]) <- cat_labels
}
data <- data0

# make a WorkPlaces plot; only work places named by at least 3 rows are kept
# (this is what removes the rare free-text 'Other' answers)
temp <- data %>%
  select(Response.ID, WorkPlacesCat, WorkPlaces) %>%
  na.omit() %>%
  unique() %>%
  group_by(WorkPlaces) %>%
  filter(n() >= 3)

pWP <- ggplot(data = temp) +
  # geom_bar() counts the rows per work place by default, so no explicit
  # y mapping (formerly the deprecated `..count..`) is needed
  geom_bar(mapping = aes(x = WorkPlaces), position = position_dodge()) +
  xlab('') + ylab('count') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1),
        plot.margin = unit(c(0, 0, 0, 0), 'cm')) +
  #facet_grid(.~DataSharingCat,scales = 'fixed',margins = FALSE) +
  scale_fill_brewer(palette = 'Accent') +
  # wrap long axis labels after at most 50 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,50})(\\s|$)', '\\1\n', x)
  })

# print() explicitly so the file is also written when the script is source()d
png('I.work.at.png', width = 30, height = 20, units = "cm", res = 300)
print(pWP)
dev.off()

# make a Current Position plot; positions with fewer than 3 rows are dropped
temp <- data %>%
  select(Response.ID, CurrentPosition, DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique() %>%
  group_by(CurrentPosition) %>%
  filter(n() >= 3)

# relative frequency of each data-sharing answer per position and question
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))

# Data-sharing answers per position, one facet column per sharing question.
pCP <- ggplot(data = temp_relFreq) +
  # geom_col() draws the pre-computed shares as they are (no binning)
  geom_col(mapping = aes(x = CurrentPosition, y = share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(. ~ DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # wrap long axis labels after at most 50 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,50})(\\s|$)', '\\1\n', x)
  })

# print() explicitly so the file is also written when the script is source()d
png('My.current.position.png', width = 30, height = 20, units = "cm", res = 300)
print(pCP)
dev.off()


# make a Neuro Discipline plot
# Neuro Discipline questions are Yes/No questions ==> only the answers other
# than 'No' are kept, and only answer values that occur at least 5 times.
temp <- data %>%
  select(Response.ID, NeuroDisciplineCat, NeuroDiscipline, DataSharingCat, DataSharing) %>%
  filter(NeuroDiscipline != 'No') %>%
  na.omit() %>%
  unique() %>%
  group_by(NeuroDiscipline) %>%
  filter(n() >= 5)

# relative frequency of each data-sharing answer within every remaining cell
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))

# Discipline answers (facet rows) by data-sharing question (facet columns).
pND <- ggplot(data = temp_relFreq) +
  # geom_col() draws the pre-computed shares as they are (no binning)
  geom_col(mapping = aes(x = NeuroDisciplineCat, y = share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(NeuroDiscipline ~ DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # wrap long axis labels after at most 50 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,50})(\\s|$)', '\\1\n', x)
  })

# print() explicitly so the file is also written when the script is source()d
png('Which.neuroscience.discipline.png', width = 30, height = 20, units = "cm", res = 300)
print(pND)
dev.off()


# make a Familiar Data Types plot
# Familiar Data Types are Yes/No questions; 'No' answers are kept here (the
# filter is deliberately commented out). Answer values that occur fewer than
# 5 times are dropped.
temp <- data %>%
  select(Response.ID, FimilarDataTypesCat, FimilarDataTypes, DataSharingCat, DataSharing) %>%
  #filter(FimilarDataTypes != 'No') %>%
  na.omit() %>%
  unique() %>%
  group_by(FimilarDataTypes) %>%
  filter(n() >= 5)

# relative frequency of each data-sharing answer within every remaining cell
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))

# Data-type answers (facet rows) by data-sharing question (facet columns).
pFD <- ggplot(data = temp_relFreq) +
  # geom_col() draws the pre-computed shares as they are (no binning)
  geom_col(mapping = aes(x = FimilarDataTypesCat, y = share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(FimilarDataTypes ~ DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # wrap long axis labels after at most 50 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,50})(\\s|$)', '\\1\n', x)
  })

# print() explicitly so the file is also written when the script is source()d
png('Please.state.if.your.work.includes.png', width = 30, height = 20, units = "cm", res = 300)
print(pFD)
dev.off()


# recreate different datasets
# Wide data set with the data-sharing, tool-usage and attitude questions.
data0 <- neuro_data %>% dplyr::select(Response.ID,
                                      starts_with('Have.you.shared.data.with'),
                                      starts_with('For.which.of.these.tasks.'),
                                      starts_with('To.what.degree.do.you.'),
                                      starts_with('Think.of.re.using.data.'),
                                      starts_with('Think.of.data.sharing.')
)

# Column-name patterns of the question groups and, in the same order, the names
# of the long-format value columns created for them.
comb_string_vec <- c('For.which.of.these.tasks.',
                     'To.what.degree.do.you.',
                     'Think.of.re.using.data.',
                     'Think.of.data.sharing.',
                     'Have.you.shared.data.with')
comb_col_names <- c('TaskStandardTools',
                    'TaskStandardToolsDegree',
                    'ThinkReusingData',
                    'ThinkSharingData',
                    'DataSharing')

library(data.table)
# Reshape data0 to long format, one question group per iteration: all columns
# whose name matches comb_string_vec[i] are stacked into a '<name>Cat' column
# (which question) and a '<name>' column (the answer).
for (i in seq_along(comb_string_vec)) {
  data0 <- data.table::melt(as.data.table(data0),
                            id.vars = c(which(!grepl(comb_string_vec[i], colnames(data0)))),
                            measure.vars = list(grep(comb_string_vec[i], colnames(data0))),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i],
                            value.factor = TRUE)

  # make some nicer labels: melt() appends the category column second to last
  data0 <- as.data.frame(data0)
  cat_col <- ncol(data0) - 1
  cat_labels <- levels(data0[[cat_col]])

  # keep only the part after the first '...' separator of the original column
  # name; names without a separator are kept whole instead of turning into NA
  cut_at <- str_locate(cat_labels, '\\.\\.\\.')[, 2]
  cut_at[is.na(cut_at)] <- 1L
  cat_labels <- substr(cat_labels, cut_at, nchar(cat_labels))
  # every dot becomes a blank ...
  cat_labels <- gsub('.', ' ', cat_labels, fixed = TRUE)
  # ... and a stand-alone 'e g' is turned back into 'e.g.' (word boundaries
  # keep text such as 'the great' untouched)
  cat_labels <- gsub('\\be g\\b', 'e.g.', cat_labels)

  # reset the labels
  levels(data0[[cat_col]]) <- cat_labels
}
data <- data0

# make a Task Standard Tools plot
# Standard Task Tools are Yes/No questions; 'No' answers are kept here (the
# filter is deliberately commented out). The free-text 'Comment' / 'Other'
# columns are removed.
temp <- data %>%
  select(Response.ID, TaskStandardToolsCat, TaskStandardTools, DataSharingCat, DataSharing) %>%
  filter(!grepl('Comment|Other', TaskStandardToolsCat)) %>%
  #filter(TaskStandardTools != 'No') %>%
  na.omit() %>%
  unique() %>%
  droplevels()

# relative frequency of each data-sharing answer within every remaining cell
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))

# Tool answers (facet rows) by data-sharing question (facet columns).
pTST <- ggplot(data = temp_relFreq) +
  # geom_col() draws the pre-computed shares as they are (no binning)
  geom_col(mapping = aes(x = TaskStandardToolsCat, y = share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(TaskStandardTools ~ DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # wrap long axis labels after at most 50 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,50})(\\s|$)', '\\1\n', x)
  })

# print() explicitly so the file is also written when the script is source()d
png('For.which.of.these.tasks.do.you.use.available.tools.or.standards.png', width = 30, height = 20, units = "cm", res = 300)
print(pTST)
dev.off()

# make a Task Standard Tools Degree plot
# One row per respondent x degree question x data-sharing question; the
# 'Other' items are dropped.
temp <- data %>%
  select(Response.ID, TaskStandardToolsDegreeCat, TaskStandardToolsDegree, DataSharingCat, DataSharing) %>%
  filter(!grepl('Other', TaskStandardToolsDegreeCat)) %>%
  na.omit() %>%
  unique()

# combine some levels: the two frequent-use answers become 'Often', the
# occasional / not-relevant answers become 'Rare' (other answers stay as is)
temp$TaskStandardToolsDegree <- as.character(temp$TaskStandardToolsDegree)
temp$TaskStandardToolsDegree[temp$TaskStandardToolsDegree %in%
                               c('As much as possible', 'Mostly')] <- 'Often'
temp$TaskStandardToolsDegree[temp$TaskStandardToolsDegree %in%
                               c('Occasionally', 'This is not relevant for my scientific work')] <- 'Rare'
temp$TaskStandardToolsDegree <- as.factor(temp$TaskStandardToolsDegree)

# relative frequency of each data-sharing answer within every remaining cell
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))

# Degree of tool use (facet rows) by data-sharing question (facet columns).
pTSD <- ggplot(data = temp_relFreq) +
  # geom_col() draws the pre-computed shares as they are (no binning)
  geom_col(mapping = aes(x = TaskStandardToolsDegreeCat, y = share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(TaskStandardToolsDegree ~ DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # wrap long axis labels after at most 50 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,50})(\\s|$)', '\\1\n', x)
  })

# print() explicitly so the file is also written when the script is source()d
png('To.what.degree.do.you.use.available.tools.or.standards.png', width = 30, height = 20, units = "cm", res = 300)
print(pTSD)
dev.off()

# make a Think of Reusing Data plot
# One row per respondent x re-use statement x data-sharing question.
temp <- data %>%
  select(Response.ID, ThinkReusingDataCat, ThinkReusingData, DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()

# relative frequency of each data-sharing answer within every remaining cell
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n))

# Re-use answers (facet rows) by data-sharing question (facet columns).
pTRD <- ggplot(data = temp_relFreq) +
  # geom_col() draws the pre-computed shares as they are (no binning)
  geom_col(mapping = aes(x = ThinkReusingDataCat, y = share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(ThinkReusingData ~ DataSharingCat, scales = 'fixed', margins = FALSE,
             labeller = label_wrap_gen(width = 30)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # wrap long axis labels after at most 50 characters
  scale_x_discrete(labels = function(x) {
    gsub('(.{1,50})(\\s|$)', '\\1\n', x)
  })

# print() explicitly so the file is also written when the script is source()d
png('Think.of.re.using.data.from.repositories.png', width = 30, height = 20, units = "cm", res = 300)
print(pTRD)
dev.off()

# make a Think of Sharing Data plot
# one row per respondent and answer combination; incomplete rows are dropped
temp <- data %>%
  dplyr::select(Response.ID, ThinkSharingDataCat, ThinkSharingData,
                DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()
# calc relative frequency: count every answer combination; summarise() peels
# off the last grouping column (DataSharing), so `share` is normalised within
# the remaining groups
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n)) %>%
  ungroup()

# NOTE(review): the name pTSD is also assigned by the tools/standards plot
# earlier in this script; that plot object is overwritten here. The name is
# kept in case later code refers to it -- consider renaming both consistently.
pTSD <- ggplot(data = temp_relFreq) +
  # `share` is a fraction; scale it so the values match the percent axis label
  geom_col(mapping = aes(x = ThinkSharingDataCat, y = 100 * share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(ThinkSharingData ~ DataSharingCat, scales = 'fixed',
             margins = FALSE, labeller = label_wrap_gen(width = 30)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # regular expression that wraps long axis labels onto multiple lines
  scale_x_discrete(labels = function(x) {gsub('(.{1,50})(\\s|$)', '\\1\n', x)})

png('Think.of.sharing.with.researchers.who.are.NOT.direct.collaborators.png', width = 30, height = 20, units = "cm", res = 300)
# print() explicitly: a bare object is not auto-printed when the script is
# run through source(), which would leave an empty PNG
print(pTSD)
dev.off()

### where are the problems ###
# Rebuild the long-format dataset for this section: keep the respondent id
# plus every column that belongs to one of the three question groups below.
data0 <- neuro_data %>% dplyr::select(Response.ID,
                                      starts_with('Have.you.shared.data.with'),
                                      starts_with('Please.indicate.'),
                                      starts_with('How.do.you.process.and.analyze.your.data.'))

# column-name patterns of the question groups ...
comb_string_vec <- c('Please.indicate.',
                     'How.do.you.process.and.analyze.your.data.',
                     'Have.you.shared.data.with')
# ... and the names of the value columns they are melted into
comb_col_names <- c('SharingProblems',
                    'HowAnalyzeData',
                    'DataSharing')

for (i in seq_along(comb_string_vec)) {
  # melt all columns of the current question group into a pair of columns:
  # <name>Cat (former column name) and <name> (the answer)
  is_group_col <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(data.table::as.data.table(data0),
                            id.vars = which(!is_group_col),
                            measure.vars = list(which(is_group_col)),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i], value.factor = TRUE)

  # make some nicer labels
  data0 <- as.data.frame(data0)
  # melt() puts the <name>Cat column second to last, followed by the value column
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])

  # cut everything in front of the first '...' separator; names without a
  # separator are kept whole instead of being turned into NA labels
  dots_end <- str_locate(level_strings, '\\.\\.\\.')[, 'end']
  has_dots <- !is.na(dots_end)
  level_strings[has_dots] <- substr(level_strings[has_dots],
                                    dots_end[has_dots],
                                    nchar(level_strings[has_dots]))
  # dots (left over from the csv header) become spaces, 'e g' is restored
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)

  # reset the labels
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

# make a Sharing Data Problems plot
# one row per respondent and answer combination; incomplete rows are dropped
temp <- data %>%
  dplyr::select(Response.ID, SharingProblemsCat, SharingProblems,
                DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()
# calc relative frequency: count every answer combination; summarise() peels
# off the last grouping column (DataSharing), so `share` is normalised within
# the remaining groups
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n)) %>%
  ungroup()

pSP <- ggplot(data = temp_relFreq) +
  # `share` is a fraction; scale it so the values match the percent axis label
  geom_col(mapping = aes(x = SharingProblemsCat, y = 100 * share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(SharingProblems ~ DataSharingCat, scales = 'fixed',
             margins = FALSE, labeller = label_wrap_gen(width = 30)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # regular expression that wraps long axis labels onto multiple lines
  scale_x_discrete(labels = function(x) {gsub('(.{1,70})(\\s|$)', '\\1\n', x)})

png('Sharing.problems.please.indicate.png', width = 40, height = 30, units = "cm", res = 300)
# print() explicitly: a bare object is not auto-printed when the script is
# run through source(), which would leave an empty PNG
print(pSP)
dev.off()

# make a How Analyze Data plot
# one row per respondent and answer combination; incomplete rows are dropped
# and answers given fewer than 5 times are removed
temp <- data %>%
  dplyr::select(Response.ID, HowAnalyzeDataCat, HowAnalyzeData,
                DataSharingCat, DataSharing) %>% #filter(HowAnalyzeData != 'No') %>%
  na.omit() %>%
  unique() %>%
  group_by(HowAnalyzeData) %>%
  filter(n() >= 5) %>%
  ungroup() # the grouping was only needed for the size filter

# calc relative frequency: count every answer combination; summarise() peels
# off the last grouping column (DataSharing), so `share` is normalised within
# the remaining groups
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n)) %>%
  ungroup()

pHAD <- ggplot(data = temp_relFreq) +
  # `share` is a fraction; scale it so the values match the percent axis label
  geom_col(mapping = aes(x = HowAnalyzeDataCat, y = 100 * share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(HowAnalyzeData ~ DataSharingCat, scales = 'fixed',
             margins = FALSE, labeller = label_wrap_gen(width = 30)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # regular expression that wraps long axis labels onto multiple lines
  scale_x_discrete(labels = function(x) {gsub('(.{1,30})(\\s|$)', '\\1\n', x)})

png('How.do.you.process.and.analyze.your.data.png', width = 30, height = 20, units = "cm", res = 300)
# print() explicitly: a bare object is not auto-printed when the script is
# run through source(), which would leave an empty PNG
print(pHAD)
dev.off()

# recreate different datasets
# Rebuild the long-format dataset for this section: keep the respondent id
# plus every column that belongs to one of the four question groups below.
data0 <- neuro_data %>% dplyr::select(Response.ID,
                                      starts_with('Have.you.shared.data.with'),
                                      starts_with('What.is.your.opinion'),
                                      starts_with('Applying.research.data.management.'),
                                      starts_with('Please.rank.the.top.'))

# column-name patterns of the question groups ...
comb_string_vec <- c('What.is.your.opinion',
                     'Please.rank.the.top.',
                     'Applying.research.data.management.',
                     'Have.you.shared.data.with')
# ... and the names of the value columns they are melted into
comb_col_names <- c('StatementsOpinion',
                    'TopSharingProblems',
                    'ApplyDataManagement',
                    'DataSharing')

for (i in seq_along(comb_string_vec)) {
  # melt all columns of the current question group into a pair of columns:
  # <name>Cat (former column name) and <name> (the answer)
  is_group_col <- grepl(comb_string_vec[i], colnames(data0))
  data0 <- data.table::melt(data.table::as.data.table(data0),
                            id.vars = which(!is_group_col),
                            measure.vars = list(which(is_group_col)),
                            variable.name = paste0(comb_col_names[i], 'Cat'),
                            value.name = comb_col_names[i], value.factor = TRUE)

  # make some nicer labels
  data0 <- as.data.frame(data0)
  # melt() puts the <name>Cat column second to last, followed by the value column
  cat_col <- ncol(data0) - 1
  level_strings <- levels(data0[, cat_col])

  # cut everything in front of the first '...' separator; names without a
  # separator are kept whole instead of being turned into NA labels
  dots_end <- str_locate(level_strings, '\\.\\.\\.')[, 'end']
  has_dots <- !is.na(dots_end)
  level_strings[has_dots] <- substr(level_strings[has_dots],
                                    dots_end[has_dots],
                                    nchar(level_strings[has_dots]))
  # dots (left over from the csv header) become spaces, 'e g' is restored
  level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
  level_strings <- gsub('e g', 'e.g.', level_strings)

  # reset the labels
  levels(data0[, cat_col]) <- level_strings
}
data <- data0

# make a Top Sharing Data Problems plot
# one row per respondent and ranked issue; incomplete rows are dropped
temp <- data %>%
  dplyr::select(Response.ID, TopSharingProblemsCat, TopSharingProblems) %>%
  na.omit() %>%
  unique()

# No colour/fill aesthetic is mapped in this plot, so the brewer scales and the
# legend theme of the sibling plots would have no effect and are left out.
pTSP <- ggplot(data = temp) +
  # ..prop.. is a fraction per facet (group = 1); scale it so the values match
  # the percent axis label
  geom_bar(mapping = aes(x = TopSharingProblems, y = 100 * ..prop.., group = 1),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  facet_grid(. ~ TopSharingProblemsCat, scales = 'fixed',
             margins = FALSE, labeller = label_wrap_gen(width = 25)) +
  # regular expression that wraps long axis labels onto multiple lines
  scale_x_discrete(labels = function(x) {gsub('(.{1,70})(\\s|$)', '\\1\n', x)})

png('Please.rank.the.top.most.pressing.issues.png', width = 40, height = 20, units = "cm", res = 300)
# print() explicitly: a bare object is not auto-printed when the script is
# run through source(), which would leave an empty PNG
print(pTSP)
dev.off()

# make an Apply Data Management plot
# one row per respondent and answer combination; incomplete rows are dropped
temp <- data %>%
  dplyr::select(Response.ID, ApplyDataManagementCat, ApplyDataManagement,
                DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()

# calc relative frequency: count every answer combination; summarise() peels
# off the last grouping column (DataSharing), so `share` is normalised within
# the remaining groups
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n)) %>%
  ungroup()

pARM <- ggplot(data = temp_relFreq) +
  # `share` is a fraction; scale it so the values match the percent axis label
  geom_col(mapping = aes(x = ApplyDataManagementCat, y = 100 * share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(ApplyDataManagement ~ DataSharingCat, scales = 'fixed',
             margins = FALSE, labeller = label_wrap_gen(width = 25)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # regular expression that wraps long axis labels onto multiple lines
  scale_x_discrete(labels = function(x) {gsub('(.{1,50})(\\s|$)', '\\1\n', x)})

png('Applying.research.data.management.png', width = 30, height = 20, units = "cm", res = 300)
# print() explicitly: a bare object is not auto-printed when the script is
# run through source(), which would leave an empty PNG
print(pARM)
dev.off()

# make a What is your opinion plot
# one row per respondent and answer combination; incomplete rows are dropped
temp <- data %>%
  dplyr::select(Response.ID, StatementsOpinionCat, StatementsOpinion,
                DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()

# calc relative frequency: count every answer combination; summarise() peels
# off the last grouping column (DataSharing), so `share` is normalised within
# the remaining groups
temp_relFreq <- temp %>%
  group_by_at(vars(-Response.ID)) %>%
  summarise(n = n()) %>%
  mutate(share = n / sum(n)) %>%
  ungroup()

pOS <- ggplot(data = temp_relFreq) +
  # `share` is a fraction; scale it so the values match the percent axis label
  geom_col(mapping = aes(x = StatementsOpinionCat, y = 100 * share,
                         color = DataSharing, fill = DataSharing),
           width = 0.5) +
  xlab('') + ylab('percent (%)') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
  theme(legend.position = "left", legend.box = "vertical") +
  facet_grid(StatementsOpinion ~ DataSharingCat, scales = 'fixed',
             margins = FALSE, labeller = label_wrap_gen(width = 25)) +
  scale_color_brewer(palette = 'Accent') + scale_fill_brewer(palette = "Accent") +
  # regular expression that wraps long axis labels onto multiple lines
  scale_x_discrete(labels = function(x) {gsub('(.{1,50})(\\s|$)', '\\1\n', x)})

png('What.is.your.opinion.on.the.following.statements.png', width = 30, height = 20, units = "cm", res = 300)
# print() explicitly: a bare object is not auto-printed when the script is
# run through source(), which would leave an empty PNG
print(pOS)
dev.off()

#### polar plot try ####
# outline colours for the DataSharing answers
cbp1 <- c("#000000", "#FFFFFF")

# The dataset rebuilt by the preceding section only holds the opinion, ranking,
# data-management and data-sharing columns, so `data` has no CurrentPosition
# column when the script runs top to bottom. Reuse `data` if it already carries
# the column, otherwise rebuild the needed columns from the raw survey data.
# NOTE(review): the rebuild assumes CurrentPosition is the melted answer of the
# 'My.current..' columns, as selected in the Figure 1 section -- confirm.
if ('CurrentPosition' %in% colnames(data)) {
  polar_data <- data
} else {
  polar_data <- neuro_data %>% dplyr::select(Response.ID,
                                             starts_with('Have.you.shared.data.with'),
                                             starts_with('My.current..'))
  polar_patterns <- c('My.current..', 'Have.you.shared.data.with')
  polar_names <- c('CurrentPosition', 'DataSharing')

  for (i in seq_along(polar_patterns)) {
    # melt the columns of one question group into <name>Cat / <name>
    is_group_col <- grepl(polar_patterns[i], colnames(polar_data))
    polar_data <- data.table::melt(data.table::as.data.table(polar_data),
                                   id.vars = which(!is_group_col),
                                   measure.vars = list(which(is_group_col)),
                                   variable.name = paste0(polar_names[i], 'Cat'),
                                   value.name = polar_names[i], value.factor = TRUE)
    polar_data <- as.data.frame(polar_data)

    # same label clean-up as in the other reshape sections of this script
    cat_col <- ncol(polar_data) - 1
    level_strings <- levels(polar_data[, cat_col])
    dots_end <- str_locate(level_strings, '\\.\\.\\.')[, 'end']
    has_dots <- !is.na(dots_end)
    level_strings[has_dots] <- substr(level_strings[has_dots],
                                      dots_end[has_dots],
                                      nchar(level_strings[has_dots]))
    level_strings <- gsub('\\.|\\.\\.', ' ', level_strings)
    level_strings <- gsub('e g', 'e.g.', level_strings)
    levels(polar_data[, cat_col]) <- level_strings
  }
}

# one row per respondent and answer combination; incomplete rows are dropped
temp <- polar_data %>%
  dplyr::select(Response.ID, CurrentPosition, DataSharingCat, DataSharing) %>%
  na.omit() %>%
  unique()

pCP <- ggplot(data = temp) +
  geom_bar(mapping = aes(x = CurrentPosition, color = DataSharing, fill = DataSharingCat),
           width = 0.75) +
  xlab('') + ylab('counts') +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1),
        plot.margin = unit(c(0, 0, 0, 0), 'cm')) +
  #facet_grid(.~DataSharingCat,scales = 'fixed',margins = FALSE) +
  scale_color_manual(values = cbp1) + scale_fill_brewer(palette = "Dark2") +
  coord_polar(theta = 'y', clip = 'off') +
  # regular expression that wraps long axis labels onto multiple lines
  scale_x_discrete(labels = function(x) {gsub('(.{1,20})(\\s|$)', '\\1\n', x)})
# print() explicitly so the plot is also drawn when the script is run via source()
print(pCP)