SFBdata_tableprep.R 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  # SFB data table preparation
  library(dplyr)

  ## Read the survey spreadsheet ----
  # `skip = 1` makes read_excel() skip the first sheet row before it reads
  # the column names.
  sfb_datainfo <- readxl::read_excel(
    "~/Documents/Seafile/My Library/sfb-datainfo.xlsx",
    skip = 1
  )

  # Force the size column to numeric; entries that cannot be parsed as a
  # number become NA (presumably sizes are in terabytes, going by the column
  # name -- confirm against the spreadsheet).
  sfb_datainfo[["data_size_TB"]] <- as.numeric(sfb_datainfo[["data_size_TB"]])
  ## Storage needed per subproject ----
  # names(sfb_datainfo)

  # Sum the declared data sizes per project. Missing sizes are skipped
  # (na.rm = TRUE) exactly as in the grand total below; without it a single
  # NA entry turned the whole project row into NA while the total silently
  # ignored it, so the rows no longer added up to the "Sum" line.
  # The column is named explicitly with the name dplyr used to generate, so
  # the header of the exported CSV does not change.
  datastorage <- sfb_datainfo %>%
    group_by(`Project number`) %>%
    summarise(`sum(data_size_TB)` = sum(data_size_TB, na.rm = TRUE))

  # Grand total over all rows of the spreadsheet (one-row, one-column table).
  a <- sfb_datainfo %>%
    summarise(sum(data_size_TB, na.rm = TRUE))

  # Append the total as a final "Sum" row. The appended row is a character
  # vector, so rbind() stores both columns of `datastorage2` as text.
  datastorage2 <- rbind(datastorage, c("Sum", as.character(a[[1, 1]])))

  # Print the table in pandoc markdown.
  pander::pandoc.table(datastorage2)
  ## Per data type: sensitivity flag, raw data formats, number of entries ----

  # Keep only primary/derived data and the four columns used below, ordered
  # by data type and then by the sensitivity flag.
  # select() + arrange() replaces a summarise() call that returned several
  # rows per group, which dplyr has deprecated since 1.1.0.
  # NOTE(review): the column header lists "primary, secondary, simulated"
  # but the filter keeps "derived" -- confirm the values used in the sheet.
  datatype <- sfb_datainfo %>%
    filter(
      `type of data (primary, secondary, simulated)` %in%
        c("primary", "derived")
    ) %>%
    select(
      data_type,
      `Sensitive (human) data ?`,
      `Raw data format`,
      `Project number`
    ) %>%
    arrange(data_type, `Sensitive (human) data ?`) %>%
    group_by(data_type)

  # One row per data type:
  #   * sensitivity flag of the first row of the (sorted) group,
  #   * all non-missing raw data formats joined with ", ",
  #   * frequency_projects = number of spreadsheet rows of that data type.
  # This replaces a cell-by-cell loop that relied on tidyr::unite() (tidyr
  # is not attached by this script), iterated over 1:length(...) (wrong when
  # the filter leaves no rows) and subset with `==`, which mishandles a
  # missing data_type.
  # NOTE(review): frequency_projects counts rows, not distinct project
  # numbers; the two agree only if a project lists each data type once.
  resulttable <- datatype %>%
    summarise(
      `Sensitive (human) data ?` = first(`Sensitive (human) data ?`),
      `Raw data format` = paste(
        `Raw data format`[!is.na(`Raw data format`)],
        collapse = ", "
      ),
      frequency_projects = n()
    ) %>%
    arrange(desc(frequency_projects))

  # Only open the data viewer when a person is running the script; View()
  # has no use (and may fail) in a non-interactive session.
  if (interactive()) {
    View(resulttable)
  }
  ## Export both summary tables as CSV ----
  # Paths are relative to the current working directory.
  utils::write.csv(
    x = datastorage2,
    file = "04_data_analysis/survey_data_analysis/datasize.csv"
  )
  utils::write.csv(
    x = resulttable,
    file = "04_data_analysis/survey_data_analysis/datatype.csv"
  )