# Repository: jcolomb/SFB1315INFproject_open


			
			
				
					
						
# SFB data table preparation

library (dplyr)
## read spreadsheet
# `skip = 1` drops the first row of the sheet before anything is read, so the
# column names come from the second row.
sfb_datainfo <- readxl::read_excel(
  "~/Documents/Seafile/My Library/sfb-datainfo.xlsx",
  skip = 1
)
# Force the size column to numeric. If it was read as text, entries that are
# not plain numbers become NA (R emits a coercion warning).
sfb_datainfo$data_size_TB <- as.numeric(sfb_datainfo$data_size_TB)

## for each subproject: data storage needed
# names(sfb_datainfo)
# Missing sizes are ignored in the per-project sums exactly as they are in
# the grand total below; otherwise a single NA entry would blank out a whole
# project while still being skipped in the "Sum" row.
# NOTE(review): a project whose sizes are all missing is therefore reported
# as 0 -- confirm that this is the desired presentation.
datastorage <- sfb_datainfo %>%
  group_by(`Project number`) %>%
  summarise(sum(data_size_TB, na.rm = TRUE))

# Grand total over all rows, kept as a one-cell table.
a <- sfb_datainfo %>%
  summarise(sum(data_size_TB, na.rm = TRUE))

# Append the grand total as a final row labelled "Sum".
datastorage2 <- rbind(datastorage, c("Sum", paste0(a[1, 1])))

pander::pandoc.table(datastorage2)

## For each data type: sensitivity flag, raw data formats, number of entries
# Keep only the rows typed "primary" or "derived" and reduce them to the four
# columns used below.
# NOTE(review): returning several rows per group from summarise() is
# deprecated since dplyr 1.1.0 (reframe() is the replacement); the call is
# left unchanged so the script keeps running on older dplyr versions.
datatype <- sfb_datainfo %>%
  filter(
    `type of data (primary, secondary, simulated)` %in% c("primary", "derived")
  ) %>%
  group_by(data_type, `Sensitive (human) data ?`) %>%
  summarise(data_type, `Raw data format`, `Project number`)

# Collapse to one row per data type:
#   * sensitivity flag of the first row listed for that data type,
#   * every non-missing raw data format, joined with ", ",
#   * frequency_projects = number of rows recorded for that data type.
# NOTE(review): frequency_projects counts rows, which equals the number of
# projects only if each project lists a data type once -- confirm.
# Base paste() replaces tidyr's unite(), which this script never loaded, and
# grouping (instead of `==` subsetting in a loop) keeps rows with a missing
# data_type from leaking into the other data types as NA rows.
resulttable <- datatype %>%
  ungroup() %>%
  group_by(data_type) %>%
  summarise(
    `Sensitive (human) data ?` = first(`Sensitive (human) data ?`),
    `Raw data format` = paste(
      `Raw data format`[!is.na(`Raw data format`)],
      collapse = ", "
    ),
    frequency_projects = n()
  ) %>%
  arrange(-frequency_projects)

# View() opens the data viewer, so only call it in an interactive session.
if (interactive()) {
  View(resulttable)
}
write.csv(resulttable, file = "04_data_analysis/survey_data_analysis/datatype.csv")
write.csv(datastorage2, file = "04_data_analysis/survey_data_analysis/datasize.csv")