library(readr) library(dplyr) library(tidyr) library(purrr) library(forcats) library(ggplot2) library(stringi) library(LexOPS) library(ggExtra) library(gghalves) library(patchwork) library(overlapping) library(parallel) library(stringdist) n_cores <- 12 # import stimuli data ----------------------------------------------------- boss <- read_csv("boss.csv") %>% # lowercase words mutate(modal_name = stri_trans_tolower(modal_name)) %>% # percentages as numbers mutate_at(vars(dplyr::matches("perc")), function(x) as.numeric(sub("%", "", x))) %>% # no spaces filter(!grepl(" ", modal_name)) %>% # join to LexOPS full_join(lexops, by = c("modal_name" = "string")) %>% # make note of whether a picture's modal name mutate(is_pic = ifelse(is.na(filename), "no", "yes")) swow <- read_csv("swow_ppmi.csv") # useful functions for selecting stimuli ---------------------------------- # function to calculate cosine similarity calc_cos <- function(a, b) { if (length(a) != length(b)) stop("Inconsistent vector lengths") sum(a * b) / sqrt(sum(a^2)*sum(b^2)) } # function to calculate associative strength for SWOW assoc_str <- function(cues_a, cues_b, .swow = swow, print = FALSE, sim_measure = "R123.Strength") { if (length(cues_a) != length(cues_b)) stop("Inconsistent vector lengths") print_perc_vals <- seq(0, 100, by = 0.1) print_iters <- if (print) { lapply(1:length(cues_a), function(i) { perc_done <- (i / length(cues_a)) * 100 if (round(perc_done, 1) %in% print_perc_vals) { print_perc_vals <<- print_perc_vals[print_perc_vals != round(perc_done, 1)] i } else { NULL } }) } else { c() } sapply(1:length(cues_a), function(i) { cues_a_i <- as.character(cues_a[[i]]) cues_b_i <- as.character(cues_b[[i]]) if (identical(cues_a_i, cues_b_i)) { cos_i <- if (cues_a_i %in% .swow$cue) 1 else NA } else if (!all(c(cues_a_i, cues_b_i) %in% .swow$cue)) { cos_i <- NA } else { cues_neighbours <- .swow %>% dplyr::filter(cue %in% c(cues_a_i, cues_b_i)) %>% dplyr::mutate(cue = dplyr::case_when( cue == cues_a_i ~ "cue_a", cue == cues_b_i ~ "cue_b" )) %>% select(cue, response, !!dplyr::sym(sim_measure)) %>% tidyr::pivot_wider(names_from = cue, values_from = !!dplyr::sym(sim_measure), names_prefix = "p_") %>% #dplyr::filter(!is.na(p_cue_a), !is.na(p_cue_b)) tidyr::replace_na(list(p_cue_a = 0, p_cue_b = 0)) cos_i <- calc_cos(cues_neighbours$p_cue_a, cues_neighbours$p_cue_b) } if (print & i %in% print_iters) { perc_done <- i / length(cues_a) * 100 cat(sprintf("%i/%i (%.1f%%), \"%s\" ~ \"%s\" = %s\n", i, length(cues_a), round(perc_done, 1), cues_a_i, cues_b_i, cos_i)) } cos_i }) } # assoc_str(c("cat", "cat", "cat"), c("cat", "teacher", "jungle")) # assoc_str(c("cat", "cat", "cat"), c("cat", "teacher", "jungle"), sim_measure = "ppmi") # function to get semantic similarity for swow (direct neighbours) cos_sim <- function(matches, target) { if (all(matches==target)) return(rep(1, length(matches))) assoc_str(rep(target, length(matches)), matches, sim_measure = "ppmi") } # cos_sim(c("cat", "teacher", "jungle", "dog"), "cat") # x <- cos_sim(sample(lexops$string, 5000), "cat") # function for maximising levenshtein distance, assuming that length is matched exactly # (returns value of 1 if maximum distance possible, otherwise 0) maximise_lev_dist <- function(xsource, targets) { dists <- stringdist(a=xsource, b=targets, method="lv") as.numeric(dists==nchar(xsource)) } # LexOPS pipeline --------------------------------------------------------- stim_seed <- 1 # words which are much more likely to be produced by Americans/Canadians americanisms <- c("squash", "airplane", "candy", "bison", "buffalo", "sidewalk", "camper", "motel", "checkers", "store", "soccer", "trash", "burner", "boardwalk", "automobile", "cellphone", "tombstone", "mailbox", "panties", "cab", "yogurt", "pants") # words which are inappropriate and distracting inappropriate <- c("piss", "penis", "sperm", "cannabis", "poop", "breast", "marijuana", "blowjob", "boob", "porn", "pussy", "bosom", "vagina", "muff", "breasts", "ass", "pee", "rump", "vulva") # words which are unimageable, or describe things whose images would be very different to those in the normed set (e.g. 'library', when there are no buildings in the set) or are not actually nouns unimageable <- c("siding", "cemetery", "polo", "woods", "bowel", "lobby", "bladder", "library", "zoo", "volcano", "apartments", "pasture", "peroxide", "temple", "weep", "sunrise", "storm", "mosque", "indigo", "bookshop", "pharmacy", "bakery", "orphanage", "sunset", "meadow", "venom", "brewery", "campus", "bulletin", "canal", "ranch", "salon", "yoga", "canteen", "livestock", "mammal", "casino", "subway", "skies", "vaccine", "placenta", "landfill", "jump", "snapshot", "vault", "badminton", "vessel", "troll", "surface", "parasite", "asylum", "sowing", "barefoot", "swimming", "arcade", "town", "shaft", "shore", "birth", "catcher", "waltz", "stomp", "spa", "animal", "cattle", "inch", "jungle", "attire", "postage", "poultry", "station", "pregnancy", "beach", "waist", "midget", "burp", "skyline", "workplace", "judo", "primate", "yawn", "trench", "bum", "injury", "shipment", "karaoke", "workshop", "chapel", "orchard", "runway", "saloon", "womb", "laughter", "cinema", "insulin", "chlorine", "stream", "movie", "chord", "piles", "rainy", "bacteria", "ravine", "beast", "motorway", "monsoon", "terrace", "woodwind", "jail", "retina", "planet", "wetland", "morphine", "cafe", "siren", "plantation", "sunlight", "meter", "print", "mucus", "sleet", "visa", "narcotics", "steroids", "urine", "blink", "appliance", "turquoise", "voice", "racing", "uterus", "chime", "slap", "nod", "pet", "bottom", "nudge", "army", "apartment", "village", "nightclub", "morgue", "kick", "city", "cellar", "outdoors", "squint", "booty", "lab", "street", "streets", "shock", "massacre", "item", "yard", "carnival", "structure", "prairie", "nudity", "shred", "nursery", "belch", "tint", "kilometer", "kilogram", "beep", "opera", "pageant", "swatter", "booze", "wiring", "vomit", "rodeo", "surf", "buffet", "bite", "chant", "swine", "mansion", "mill", "shack", "cabin", "alley", "lump", "ballot", "supper", "material", "lobe", "steel", "venison", "beef", "butt", "liquor", "pop", "office", "museum", "lookout", "port", "vitamin", "tee", "church", "labyrinth", "utensil") # words which describe people (no images of people in the normed set of images, only parts like 'hand') people <- c("tailor", "landlord", "waitress", "lifeguard", "parent", "swimmer", "cashier", "actor", "vendor", "bride", "broker", "salesman", "sailor", "pilot", "pirate", "redhead", "tutor", "runner", "brunette", "adult", "wizard", "chaps", "lawyer", "beekeeper", "priest", "kids", "uncle", "jockey", "surfer", "tenant", "violinist", "jury", "babysitter", "milkman", "astronaut", "beggar", "infant", "child", "robber", "baby", "talker", "tradesman", "lad", "magician", "plumber", "captain", "shooter", "nun", "prostitute", "pastor", "chauffeur", "rapper", "assassin", "bachelor", "stewardess", "visitor", "chairman", "chairwoman", "buyer", "sons", "banker", "veteran", "niece", "twins", "lady", "skater", "wrestler", "actress", "birth", "slave", "bomber", "boxer", "musician", "inmate", "worker", "kid", "carpenter", "physician", "blacksmith", "poet", "tribe", "shopper", "pilgrim", "ambassador", "chemist", "citizen", "male", "analyst", "chorus", "caveman", "butler", "brigade", "publisher", "roommate", "jogging", "mistress", "writer", "accountant", "mum", "attendant", "seaman", "monarch", "audience", "umpire", "brother", "typist", "scout", "assistant", "witch", "singer", "superhero", "troops", "pupil", "waiter", "troop", "pianist", "bodyguard", "biker", "choir", "surgeon", "men", "maid", "quarterback", "model", "officer", "mermaid", "teacher", "stepmother", "lecturer", "thief", "student", "reporter", "librarian", "nurse", "stylist", "realtor", "boy", "operator", "therapist", "platoon", "mother", "ladies", "men", "wife", "husband", "wives", "husbands", "nomad", "proprietor", "steward") # shortened words which may otherwise appear twice shortened <- c("rhino", "limo", "chimp", "scuba", "bike") # alternate versions of the same word, e.g. tomb and tombstone (plurals are okay, but this prevents repeating a word, e.g. 'spice' and 'spices') alternates <- c("spices", "tomb", "nostril", "needles", "chips", "levers", "motorbike", "peanuts", "trucks", "stairway", "meteor", "stair", "snails", "boots", "kitty", "piggy", "liqueur", "weed", "tummy", "ropes", "bumblebee") # words which I think have high concreteness because they are common misspellings of concrete words misspellings <- c("canon") # modal names for images clearly incorrectly named incorrects <- c("nut", "trumpet", "tuba", "spinach") # plural words (as all images are single objects) plurals <- c("sticks", "bees", "buttons", "strings", "ceramics", "tables", "arms", "molasses", "cereals", "mice", "brushes") # incongruent matches which have been excluded as they were possible descriptions for their image plausible <- c( "rubber", # was a match for "statue", which could also be a small rubber "buck", # was a match for a picture of a book (homophone) "marrow", # was a match for pickle "logo", # just about anything could be a logo "kit" # was a match for a baseball cap ) stim <- boss %>% filter( (is_pic == "yes" & dupe_pref) | is_pic=="no", modal_name %in% swow$cue, PK.Brysbaert >= 0.9, PoS.SUBTLEX_UK == "noun", CNC.Brysbaert >= 4, !modal_name %in% c(americanisms, inappropriate, unimageable, people, shortened, alternates, misspellings, incorrects, plurals, plausible) ) %>% mutate(is_pic = as.factor(is_pic)) %>% rename(string = "modal_name") %>% split_by(is_pic, "yes" ~ "no") %>% # -0.5:0 (similarity of between 0.5 and 1) seems to give high similarity # -1:-0.99 (similarity of between 0 and 0.01) gives suitably dissimilar control_for_map(cos_sim, string, -1:-0.99, name = "cos_ppmi_sim") %>% control_for_map(maximise_lev_dist, string, 1:1, name = "max_lev_dist") %>% control_for(Length, 0:0) %>% control_for(CNC.Brysbaert, -0.25:0.25) %>% control_for(ON.OLD20, -0.75:0.75) %>% control_for(Zipf.SUBTLEX_UK, -0.125:0.125) %>% control_for(BG.SUBTLEX_UK, -0.0025:0.0025) %>% generate(200, seed = stim_seed) write_csv(stim, "stim.csv") stim_lev_dists <- stim %>% rowwise() %>% mutate(levenshtein_distance = stringdist(A1, A2, method="lv")) %>% ungroup() %>% select(item_nr, levenshtein_distance) stim_tidy <- stim %>% long_format("all") %>% rename(perc_name_agree = "perc_name_agree_denom_fq_inputs") %>% pivot_wider(id_cols = item_nr, names_from = condition, values_from = c(string, filename, perc_name_agree, Zipf.SUBTLEX_UK, Length, cos_ppmi_sim, CNC.Brysbaert, BG.SUBTLEX_UK, nb_diff_names, ON.OLD20)) %>% select(item_nr, string_A1, filename_A1, perc_name_agree_A1, string_A2, everything(), -filename_A2, -perc_name_agree_A2) %>% left_join(stim_lev_dists, by="item_nr") # optimise split for counterbalancing ------------------------------------- message("Optimising counterbalance split") vars_to_match <- c("perc_name_agree_A1", "Zipf.SUBTLEX_UK_A1", "Zipf.SUBTLEX_UK_A2", "cos_ppmi_sim_A1", "cos_ppmi_sim_A2", "BG.SUBTLEX_UK_A1", "BG.SUBTLEX_UK_A2", "CNC.Brysbaert_A1", "CNC.Brysbaert_A2", "ON.OLD20_A1", "ON.OLD20_A2", "Length_A1", "Length_A2") # how many seeds to try max_seed <- 50000 cl <- makeCluster(n_cores) cl_packages <- clusterEvalQ(cl, {library(dplyr); library(purrr); library(overlapping)}) clusterExport(cl, c("stim_tidy", "vars_to_match")) dist_overlaps <- parLapply(cl, 1:max_seed, function(seed_i) { cat(sprintf("%s \r", seed_i)) set.seed(seed_i) order_grp_vec <- sample(rep(c(1, 2), 100), 200, replace = FALSE) stim_seed_i <- mutate(stim_tidy, order_grp = order_grp_vec) tibble( seed_i = seed_i, var_j = vars_to_match, ov = map_dbl(vars_to_match, function(var_j) { var_grp_1 <- stim_seed_i %>% filter(order_grp == 1) %>% pull(var_j) %>% as.numeric() var_grp_2 <- stim_seed_i %>% filter(order_grp == 2) %>% pull(var_j) %>% as.numeric() ov <- overlapping::overlap(list(var_grp_1, var_grp_2)) ov$OV %>% unname() %>% as.numeric() }) ) }) %>% reduce(bind_rows) stopCluster(cl) dist_overlaps_summ <- dist_overlaps %>% group_by(seed_i) %>% summarise(sum = sum(ov), median = median(ov), mean = mean(ov), sd = sd(ov), min = min(ov)) %>% arrange(desc(mean), sd) # apply the split --------------------------------------------------------- dist_overlaps_summ %>% pull(seed_i) %>% first() %>% set.seed() order_grp_vec <- sample(rep(c(1, 2), 100), 200, replace = FALSE) stim_tidy <- mutate(stim_tidy, order_grp = order_grp_vec) write_csv(stim_tidy, "stim_tidy.csv") stim_tidy_long <- stim %>% long_format("all") %>% arrange(item_nr) %>% mutate( filename = ifelse(is.na(filename), lag(filename), filename), cos_ppmi_sim_tidy = as.numeric(ifelse(cos_ppmi_sim=="1", NA, cos_ppmi_sim)) ) %>% rename(perc_name_agree = "perc_name_agree_denom_fq_inputs") %>% # also store the order group full_join(select(stim_tidy, item_nr, order_grp), by = "item_nr") write_csv(stim_tidy_long, "stim_tidy_long.csv") # select practice trials -------------------------------------------------- message("Selecting practice trials") prac_seed <- 111 set.seed(prac_seed) A1_trials <- sample(1:20, 10) practice_stim <- boss %>% filter( (is_pic == "yes" & dupe_pref) | is_pic=="no", modal_name %in% swow$cue, PK.Brysbaert >= 0.9, PoS.SUBTLEX_UK == "noun", CNC.Brysbaert >= 4, !modal_name %in% c(americanisms, inappropriate, unimageable, people, shortened, alternates, misspellings, incorrects, plurals, plausible), !modal_name %in% stim_tidy_long$string ) %>% mutate(is_pic = as.factor(is_pic)) %>% rename(string = "modal_name") %>% split_by(is_pic, "yes" ~ "no") %>% # -0.5:0 (similarity of between 0.5 and 1) seems to give high similarity # -1:-0.999 (similarity of between 0 and 0.001) gives suitably disimilar control_for_map(cos_sim, string, -1:-0.99, name = "cos_ppmi_sim") %>% control_for_map(maximise_lev_dist, string, 1:1, name = "max_lev_dist") %>% control_for(Length, 0:0) %>% generate(20, seed = prac_seed) practice_stim_long <- practice_stim %>% long_format("all") %>% mutate(filename = ifelse(is.na(filename), lag(filename), filename)) %>% # randomly allocate half the items to congruent condition, and half to incongruent, and only keep the relevant trials filter( (condition=="A1" & item_nr %in% A1_trials) | (condition=="A2" & !item_nr %in% A1_trials) ) # save the practice trials write_csv(practice_stim_long, "practice_stim.csv") # tidy picture-word stimuli figures --------------------------------------- itemwise_vars_for_dens <- c("Length", "Zipf.SUBTLEX_UK", "BG.SUBTLEX_UK", "CNC.Brysbaert", "ON.OLD20") distwise_vars_for_dens <- c("perc_name_agree", "cos_ppmi_sim_tidy") # plt_cols <- c("#007EFF", "#FF007E") plt_cols <- c("#E69F00", "#56B4E9") counterbalanced_grps_label <- c("Set 1", "Set 2") fontsize <- 12 # Format data plt_stim_itemwise <- stim_tidy_long %>% mutate(condition = sub("-", "\n", condition, fixed=TRUE)) %>% pivot_longer(cols=all_of(itemwise_vars_for_dens), names_to="plt_variable", values_to="plt_value") %>% mutate(condition_j = as.numeric(factor(condition))) %>% rowwise() %>% # add horizontal jitter to points and lines mutate(., condition_j = ifelse(condition=="A1", condition_j + runif(1, 0.1, 0.15), condition_j + runif(1, -0.15, -0.1))) %>% # mutate(., condition_j = ifelse(condition=="A1", condition_j + 0.15, condition_j -0.15)) %>% ungroup() %>% mutate( condition = ifelse(condition=="A1", "Picture\nCongruent", "Picture\nIncongruent"), plt_variable = factor(plt_variable, levels = itemwise_vars_for_dens), plt_variable = fct_recode( plt_variable, `Length` = "Length", `Zipf Frequency` = "Zipf.SUBTLEX_UK", `Bigram Probability` = "BG.SUBTLEX_UK", `Concreteness` = "CNC.Brysbaert", `OLD20` = "ON.OLD20" ) ) itemwise_plt <- plt_stim_itemwise %>% ggplot(aes(condition, plt_value, colour=factor(order_grp), fill=factor(order_grp))) + geom_half_violin(data=filter(plt_stim_itemwise, condition=="Picture\nCongruent"), alpha=0.5, trim=FALSE, side="l", position=position_identity(), show.legend=FALSE) + geom_half_violin(data=filter(plt_stim_itemwise, condition=="Picture\nIncongruent"), alpha=0.5, trim=FALSE, side="r", position=position_identity(), show.legend=FALSE) + geom_point(aes(x=condition_j), alpha=0.75, shape=16, size=0.8) + geom_line(aes(x=condition_j, group=item_nr), alpha=0.3) + scale_fill_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols) + scale_colour_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols, guide = guide_legend(override.aes = list(alpha = 1, size=2, linewidth=1))) + scale_y_continuous(breaks = scales::pretty_breaks(5)) + facet_wrap(vars(plt_variable), scales="free_y") + labs( x = "\nCongruency Condition\n", y = "Value", tag = "a" ) + theme_classic() + theme( legend.position = c(0.85, 0.15), text = element_text(size=fontsize), # plot.title = element_text(hjust=-0.1), strip.background = element_blank(), legend.margin = margin(), plot.margin = margin(3,3,3,8, unit="pt") ) plt_stim_distwise <- stim_tidy_long %>% mutate(condition = sub("-", "\n", condition, fixed=TRUE)) %>% pivot_longer(cols=all_of(distwise_vars_for_dens), names_to="plt_variable", values_to="plt_value") %>% mutate( plt_variable = factor(plt_variable, levels = distwise_vars_for_dens), plt_variable = fct_recode( plt_variable, `Percentage of Name Agreement` = "perc_name_agree", `Cosine PPMI Semantic Similarity` = "cos_ppmi_sim_tidy" ), plt_variable = fct_rev(plt_variable) ) custom_breaks <- function(x) { if (max(x) < 0.01) pretty(x, 3) else pretty(x, 5) } distwise_plt <- plt_stim_distwise %>% ggplot(aes(plt_value, colour=factor(order_grp), fill=factor(order_grp))) + geom_density(alpha=0.5, show.legend = FALSE) + facet_wrap(vars(plt_variable), scales="free") + labs( x = "Value", y = "Density", tag = "b" ) + scale_fill_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols) + scale_colour_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols) + scale_x_continuous(breaks = custom_breaks) + scale_y_continuous(breaks = scales::pretty_breaks(3)) + theme_classic() + theme( legend.position = "none", text = element_text(size=fontsize), # plot.title = element_text(hjust=-0.1), strip.background = element_blank(), legend.margin = margin(), plot.margin = margin(3,3,3,8, unit="pt") ) stim_summ <- (itemwise_plt / distwise_plt) + plot_layout( heights = c(5.5, 1), widths = 6.5, guides = "keep" ) ggsave(file.path("fig", "stimuli_summary.png"), stim_summ, height=5.5, width=6.5, device="png", type="cairo") ggsave(file.path("fig", "stimuli_summary.pdf"), stim_summ, height=5.5, width=6.5, device="pdf") # select localiser stimuli ------------------------------------------------ message("Selecting localiser stimuli") # misspelt words in the prevalence norms prev_misspellings <- c("yogurt") # words which might not be technically words nonwords <- c("yippee") max_seed <- 500000 set.seed(stim_seed) loc_seeds <- sample(1:2147483647, max_seed, replace=FALSE) match_vars <- c("Zipf.SUBTLEX_UK", "PREV.Brysbaert", "CNC.Brysbaert", "AoA.Kuperman", "VAL.Warriner", "AROU.Warriner", "DOM.Warriner", "ON.OLD20", "BG.SUBTLEX_UK", "Length", "RT.BLP", "Accuracy.BLP") loc_stim_pool <- lexops %>% filter( PK.Brysbaert >= 0.9, !is.na(Zipf.SUBTLEX_UK), !string %in% c(prev_misspellings, nonwords), ) %>% mutate(PoS.SUBTLEX_UK = as.character(PoS.SUBTLEX_UK)) %>% select(string, all_of(match_vars), PoS.SUBTLEX_UK) pos_cats <- unique(loc_stim_pool$PoS.SUBTLEX_UK) pos_match_vars <- sapply(pos_cats, function(x) sprintf("pos_%s", x)) pos_dum_vals <- map_dfc(pos_cats, function(x) { as.numeric(loc_stim_pool$PoS.SUBTLEX_UK == x) }) %>% set_names(pos_match_vars) loc_stim_pool <- bind_cols(loc_stim_pool, pos_dum_vals) cl <- makeCluster(n_cores) cl_packages <- clusterEvalQ(cl, {library(dplyr); library(purrr); library(overlapping)}) clusterExport(cl, c("loc_stim_pool", "match_vars", "pos_match_vars", "stim_tidy_long", "practice_stim_long", "loc_seeds")) loc_seeds_res <- parLapply(cl, 1:max_seed, function(seed_i) { set.seed(loc_seeds[seed_i]) loc_stim_i <- loc_stim_pool %>% filter( !string %in% stim_tidy_long$string, !string %in% practice_stim_long$string ) %>% slice_sample(n = 100) ov_numeric <- map_dbl(match_vars, function(var_j) { pool_vals <- loc_stim_pool[[var_j]][!is.na(loc_stim_pool[[var_j]])] sample_vals <- loc_stim_i[[var_j]][!is.na(loc_stim_i[[var_j]])] ov <- overlapping::overlap(list(pool_vals, sample_vals)) ov$OV %>% unname() %>% as.numeric() }) ov_pos <- map_dbl(pos_match_vars, function(var_j) { pool_vals <- loc_stim_pool[[var_j]][!is.na(loc_stim_pool[[var_j]])] sample_vals <- loc_stim_i[[var_j]][!is.na(loc_stim_i[[var_j]])] ov <- overlapping::overlap(list(pool_vals, sample_vals)) ov$OV %>% unname() %>% as.numeric() }) bind_rows( tibble( seed = loc_seeds[seed_i], ov = ov_numeric, vars = match_vars, weight = 1 ), tibble( seed = loc_seeds[seed_i], ov = ov_pos, vars = pos_match_vars, weight = 1/8 ) ) %>% mutate( weight = case_when( vars == "Length" ~ 3, vars == "Zipf.SUBTLEX_UK" ~ 2, TRUE ~ weight ), ov_w = ov * weight ) %>% group_by(seed) %>% summarise(ov_sum = sum(ov_w)) }) %>% reduce(bind_rows) stopCluster(cl) best_seed <- loc_seeds_res %>% arrange(desc(ov_sum)) %>% top_n(1) %>% pull(seed) # recreate the best stimulus set set.seed(best_seed) loc_stim <- loc_stim_pool %>% filter( !string %in% stim_tidy_long$string, !string %in% practice_stim_long$string ) %>% slice_sample(n = 100) loc_stim_tidy <- loc_stim %>% rename(word=string) %>% mutate( item_nr = row_number() ) %>% select(item_nr, everything()) write_csv(loc_stim, "localiser_stim.csv") # plot the distributions for localiser stim ------------------------------- repres_cols <- c("#696867", "#5ba7d6") dens_pl <- bind_rows( mutate(loc_stim, type="Sample"), mutate(loc_stim_pool, type="Population") ) %>% pivot_longer(cols=c(match_vars[match_vars!="Length"]), names_to="var", values_to="value") %>% mutate(var_tidy = recode( var, Zipf.SUBTLEX_UK = "Zipf Frequency", BG.SUBTLEX_UK = "Bigram Probability", CNC.Brysbaert = "Concreteness", ON.OLD20 = "OLD20", VAL.Warriner = "Valence", AROU.Warriner = "Arousal", DOM.Warriner = "Dominance", AoA.Kuperman = "Age of Acquisition", PREV.Brysbaert = "Prevalence", RT.BLP = "Lexical Decision RT", Accuracy.BLP = "Lexical Decision Accuracy" )) %>% mutate(var_tidy = factor(var_tidy, levels=sort(unique(var_tidy)))) %>% ggplot(aes(value, fill=type, alpha=type)) + geom_density() + facet_wrap(vars(var_tidy), scales="free", nrow=4) + labs( x = "Value", y = "Density", tag = "a", fill = NULL ) + scale_fill_manual(values=repres_cols) + scale_alpha_manual(values=c(1, 0.5), guide=FALSE) + scale_x_continuous(guide = guide_axis(check.overlap = TRUE)) + theme_classic() + theme( legend.position = c(0.85, 0.1), axis.text.y=element_blank(), axis.ticks.y=element_blank(), strip.background = element_blank(), legend.margin = margin(), plot.margin = margin(3,3,3,3, unit="pt") ) dat_length <- bind_rows( mutate(loc_stim, type="Sample"), mutate(loc_stim_pool, type="Population") ) %>% mutate(var="Length", value=as.numeric(Length)) dat_pos <- bind_rows( loc_stim %>% mutate(type="Sample", max_n=nrow(.)), loc_stim_pool %>% mutate(type="Population", max_n=nrow(.)) ) %>% mutate(var="Part of Speech", value=PoS.SUBTLEX_UK) %>% group_by(type, value, var) %>% summarise(prop = n()/max_n) %>% distinct() %>% filter(value %in% loc_stim$PoS.SUBTLEX_UK) length_pl <- dat_length %>% ggplot(aes(value)) + geom_histogram(aes(y = after_stat(count / sum(count))), position = "identity", binwidth=1, fill=repres_cols[[1]], alpha=1, data=filter(dat_length, type=="Population")) + geom_histogram(aes(y = after_stat(count / sum(count))), position = "identity", binwidth=1, fill=repres_cols[[2]], alpha=0.5, data=filter(dat_length, type=="Sample")) + facet_wrap(vars(var), scales="free") + labs( x = "", y = "Proportion", tag = "b" ) + theme_classic() + theme( legend.position = "none", legend.margin = margin(), plot.margin = margin(3,3,3,3, unit="pt"), strip.background = element_blank() ) pos_pl <- dat_pos %>% ggplot(aes(value, prop)) + geom_col(position = "identity", fill=repres_cols[[1]], alpha=1, data=filter(dat_pos, type=="Population")) + geom_col(position = "identity", fill=repres_cols[[2]], alpha=0.5, data=filter(dat_pos, type=="Sample")) + facet_wrap(vars(var), scales="free") + labs( x = "", y = NULL ) + theme_classic() + theme( legend.position = "none", legend.margin = margin(), plot.margin = margin(3,3,3,3, unit="pt"), strip.background = element_blank() ) # loc_stim_summ <- (dens_pl / (length_pl | pos_pl + xlab("Value") + theme(axis.title.x = element_text(hjust=-0.175)))) + # plot_layout(heights = c(5.8, 1.2), widths = c(6.5, 2.5, 4)) loc_stim_summ <- dens_pl + length_pl + (pos_pl + xlab("Value") + theme(axis.title.x = element_text(hjust=0.18))) + plot_layout( heights = c(6, 1, 1), design = " AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA BBBBBBBBBBCCCCCCCCCCCCCCCCCCCC" ) # loc_stim_summ ggsave(file.path("fig", "localiser_stimuli_summary.png"), loc_stim_summ, height=5.5, width=6.5, device="png", type="cairo") ggsave(file.path("fig", "localiser_stimuli_summary.pdf"), loc_stim_summ, height=5.5, width=6.5, device="pdf") message("Stimuli done!")