12345678910111213141516171819202122232425262728293031323334353637383940414243 |
# SFB data table preparation ----
#
# Reads the SFB data-information spreadsheet and writes two summary tables:
#   * datasize.csv - summed data size (TB) per project number, plus a total row
#   * datatype.csv - per data type: sensitivity flag, raw data formats, and the
#                    number of spreadsheet entries of that type
library(dplyr)

## Read spreadsheet ----
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
sfb_datainfo <- readxl::read_excel(
  "~/Documents/Seafile/My Library/sfb-datainfo.xlsx",
  skip = 1
)
# Values that cannot be parsed as numbers become NA (as.numeric warns).
sfb_datainfo$data_size_TB <- as.numeric(sfb_datainfo$data_size_TB)

## For each subproject: data storage needed ----
# na.rm = TRUE so that a single missing size does not turn a whole project's
# sum into NA; this matches how the grand total below is computed.
# The column keeps its historical auto-generated name so the CSV header is
# unchanged.
datastorage <- sfb_datainfo %>%
  group_by(`Project number`) %>%
  summarise(`sum(data_size_TB)` = sum(data_size_TB, na.rm = TRUE))

total_size_tb <- sum(sfb_datainfo$data_size_TB, na.rm = TRUE)

# Appending a character row deliberately turns the size column into text:
# the "Sum" label has to share a column with the project numbers.
datastorage2 <- rbind(datastorage, c("Sum", as.character(total_size_tb)))
pander::pandoc.table(datastorage2)

## For each data sort: sensitivity, raw data formats, number of entries ----
# NOTE(review): the column header lists (primary, secondary, simulated) but
# the filter keeps "primary" and "derived" - confirm "derived" is the value
# actually used in the spreadsheet.
datatype <- sfb_datainfo %>%
  filter(
    `type of data (primary, secondary, simulated)` %in% c("primary", "derived")
  ) %>%
  select(
    data_type,
    `Sensitive (human) data ?`,
    `Raw data format`,
    `Project number`
  ) %>%
  # Sort so that, within a data type, rows are ordered by the sensitivity
  # flag; first() below then picks the same entry the grouped table used to.
  arrange(data_type, `Sensitive (human) data ?`)

# One row per data type. Raw data formats are joined with ", " (missing
# formats are skipped, duplicates are kept).
# NOTE(review): frequency_projects counts spreadsheet rows of that data type,
# not distinct project numbers - confirm this is the intended measure.
resulttable <- datatype %>%
  group_by(data_type) %>%
  summarise(
    `Sensitive (human) data ?` = first(`Sensitive (human) data ?`),
    `Raw data format` = paste(
      `Raw data format`[!is.na(`Raw data format`)],
      collapse = ", "
    ),
    frequency_projects = n()
  ) %>%
  arrange(desc(frequency_projects))

# View() needs an interactive session; skip it when run via Rscript.
if (interactive()) {
  View(resulttable)
}

write.csv(
  resulttable,
  file = "04_data_analysis/survey_data_analysis/datatype.csv"
)
write.csv(
  datastorage2,
  file = "04_data_analysis/survey_data_analysis/datasize.csv"
)
|