- library(readr)
- library(dplyr)
- library(tidyr)
- library(purrr)
- library(forcats)
- library(ggplot2)
- library(stringi)
- library(LexOPS)
- library(ggExtra)
- library(gghalves)
- library(patchwork)
- library(overlapping)
- library(parallel)
- library(stringdist)
- n_cores <- 12
- # import stimuli data -----------------------------------------------------
- boss <- read_csv("boss.csv") %>%
- # lowercase words
- mutate(modal_name = stri_trans_tolower(modal_name)) %>%
- # percentages as numbers
- mutate_at(vars(dplyr::matches("perc")), function(x) as.numeric(sub("%", "", x))) %>%
- # no spaces
- filter(!grepl(" ", modal_name)) %>%
- # join to LexOPS
- full_join(lexops, by = c("modal_name" = "string")) %>%
- # note whether each word is the modal name of a picture in the normed set
- mutate(is_pic = ifelse(is.na(filename), "no", "yes"))
- swow <- read_csv("swow_ppmi.csv")
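- # expected columns in swow_ppmi.csv (inferred from how it is used below): cue, response, R123.Strength, ppmi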
- # useful functions for selecting stimuli ----------------------------------
- # function to calculate cosine similarity
- calc_cos <- function(a, b) {
- if (length(a) != length(b)) stop("Inconsistent vector lengths")
- sum(a * b) / sqrt(sum(a^2)*sum(b^2))
- }
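- # worked example (not run): calc_cos(c(1, 0, 1), c(1, 1, 0))
- # = 1 / sqrt(2 * 2) = 0.5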
- # function to calculate associative strength for SWOW
- assoc_str <- function(cues_a, cues_b, .swow = swow, print = FALSE, sim_measure = "R123.Strength") {
-
- if (length(cues_a) != length(cues_b)) stop("Inconsistent vector lengths")
-
- print_perc_vals <- seq(0, 100, by = 0.1)
-
- print_iters <- if (print) {
- lapply(1:length(cues_a), function(i) {
- perc_done <- (i / length(cues_a)) * 100
- if (round(perc_done, 1) %in% print_perc_vals) {
- print_perc_vals <<- print_perc_vals[print_perc_vals != round(perc_done, 1)]
- i
- } else {
- NULL
- }
- })
- } else {
- c()
- }
-
- sapply(1:length(cues_a), function(i) {
-
- cues_a_i <- as.character(cues_a[[i]])
- cues_b_i <- as.character(cues_b[[i]])
-
- if (identical(cues_a_i, cues_b_i)) {
- cos_i <- if (cues_a_i %in% .swow$cue) 1 else NA
- } else if (!all(c(cues_a_i, cues_b_i) %in% .swow$cue)) {
- cos_i <- NA
- } else {
- cues_neighbours <- .swow %>%
- dplyr::filter(cue %in% c(cues_a_i, cues_b_i)) %>%
- dplyr::mutate(cue = dplyr::case_when(
- cue == cues_a_i ~ "cue_a",
- cue == cues_b_i ~ "cue_b"
- )) %>%
- select(cue, response, !!dplyr::sym(sim_measure)) %>%
- tidyr::pivot_wider(names_from = cue, values_from = !!dplyr::sym(sim_measure), names_prefix = "p_") %>%
- #dplyr::filter(!is.na(p_cue_a), !is.na(p_cue_b))
- tidyr::replace_na(list(p_cue_a = 0, p_cue_b = 0))
-
- cos_i <- calc_cos(cues_neighbours$p_cue_a, cues_neighbours$p_cue_b)
- }
-
- if (print && i %in% unlist(print_iters)) {
- perc_done <- i / length(cues_a) * 100
- cat(sprintf("%i/%i (%.1f%%), \"%s\" ~ \"%s\" = %s\n", i, length(cues_a), round(perc_done, 1), cues_a_i, cues_b_i, cos_i))
- }
-
- cos_i
- })
- }
- # assoc_str(c("cat", "cat", "cat"), c("cat", "teacher", "jungle"))
- # assoc_str(c("cat", "cat", "cat"), c("cat", "teacher", "jungle"), sim_measure = "ppmi")
- # function to get semantic similarity for SWOW (direct neighbours)
- cos_sim <- function(matches, target) {
- if (all(matches==target)) return(rep(1, length(matches)))
- assoc_str(rep(target, length(matches)), matches, sim_measure = "ppmi")
- }
- # cos_sim(c("cat", "teacher", "jungle", "dog"), "cat")
- # x <- cos_sim(sample(lexops$string, 5000), "cat")
- # function for maximising Levenshtein distance, assuming that length is matched exactly
- # (returns 1 if the distance is the maximum possible, i.e. equal to the word length, otherwise 0)
- maximise_lev_dist <- function(xsource, targets) {
- dists <- stringdist(a=xsource, b=targets, method="lv")
- as.numeric(dists==nchar(xsource))
- }
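- # worked example (not run): maximise_lev_dist("cat", c("dog", "cot", "tac"))
- # Levenshtein distances are 3, 1, 2; only "dog" reaches nchar("cat") = 3, so this returns c(1, 0, 0)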
- # LexOPS pipeline ---------------------------------------------------------
- stim_seed <- 1
- # words which are much more likely to be produced by Americans/Canadians
- americanisms <- c("squash", "airplane", "candy", "bison", "buffalo", "sidewalk", "camper", "motel", "checkers", "store", "soccer", "trash", "burner", "boardwalk", "automobile", "cellphone", "tombstone", "mailbox", "panties", "cab", "yogurt", "pants")
- # words which are inappropriate and distracting
- inappropriate <- c("piss", "penis", "sperm", "cannabis", "poop", "breast", "marijuana", "blowjob", "boob", "porn", "pussy", "bosom", "vagina", "muff", "breasts", "ass", "pee", "rump", "vulva")
- # words which are unimageable, describe things whose images would be very different from those in the normed set (e.g. 'library', when there are no buildings in the set), or are not actually nouns
- unimageable <- c("siding", "cemetery", "polo", "woods", "bowel", "lobby", "bladder", "library", "zoo", "volcano", "apartments", "pasture", "peroxide", "temple", "weep", "sunrise", "storm", "mosque", "indigo", "bookshop", "pharmacy", "bakery", "orphanage", "sunset", "meadow", "venom", "brewery", "campus", "bulletin", "canal", "ranch", "salon", "yoga", "canteen", "livestock", "mammal", "casino", "subway", "skies", "vaccine", "placenta", "landfill", "jump", "snapshot", "vault", "badminton", "vessel", "troll", "surface", "parasite", "asylum", "sowing", "barefoot", "swimming", "arcade", "town", "shaft", "shore", "birth", "catcher", "waltz", "stomp", "spa", "animal", "cattle", "inch", "jungle", "attire", "postage", "poultry", "station", "pregnancy", "beach", "waist", "midget", "burp", "skyline", "workplace", "judo", "primate", "yawn", "trench", "bum", "injury", "shipment", "karaoke", "workshop", "chapel", "orchard", "runway", "saloon", "womb", "laughter", "cinema", "insulin", "chlorine", "stream", "movie", "chord", "piles", "rainy", "bacteria", "ravine", "beast", "motorway", "monsoon", "terrace", "woodwind", "jail", "retina", "planet", "wetland", "morphine", "cafe", "siren", "plantation", "sunlight", "meter", "print", "mucus", "sleet", "visa", "narcotics", "steroids", "urine", "blink", "appliance", "turquoise", "voice", "racing", "uterus", "chime", "slap", "nod", "pet", "bottom", "nudge", "army", "apartment", "village", "nightclub", "morgue", "kick", "city", "cellar", "outdoors", "squint", "booty", "lab", "street", "streets", "shock", "massacre", "item", "yard", "carnival", "structure", "prairie", "nudity", "shred", "nursery", "belch", "tint", "kilometer", "kilogram", "beep", "opera", "pageant", "swatter", "booze", "wiring", "vomit", "rodeo", "surf", "buffet", "bite", "chant", "swine", "mansion", "mill", "shack", "cabin", "alley", "lump", "ballot", "supper", "material", "lobe", "steel", "venison", "beef", "butt", "liquor", "pop", "office", "museum", "lookout", "port", "vitamin", "tee", "church", "labyrinth", "utensil")
- # words which describe people (no images of people in the normed set of images, only parts like 'hand')
- people <- c("tailor", "landlord", "waitress", "lifeguard", "parent", "swimmer", "cashier", "actor", "vendor", "bride", "broker", "salesman", "sailor", "pilot", "pirate", "redhead", "tutor", "runner", "brunette", "adult", "wizard", "chaps", "lawyer", "beekeeper", "priest", "kids", "uncle", "jockey", "surfer", "tenant", "violinist", "jury", "babysitter", "milkman", "astronaut", "beggar", "infant", "child", "robber", "baby", "talker", "tradesman", "lad", "magician", "plumber", "captain", "shooter", "nun", "prostitute", "pastor", "chauffeur", "rapper", "assassin", "bachelor", "stewardess", "visitor", "chairman", "chairwoman", "buyer", "sons", "banker", "veteran", "niece", "twins", "lady", "skater", "wrestler", "actress", "birth", "slave", "bomber", "boxer", "musician", "inmate", "worker", "kid", "carpenter", "physician", "blacksmith", "poet", "tribe", "shopper", "pilgrim", "ambassador", "chemist", "citizen", "male", "analyst", "chorus", "caveman", "butler", "brigade", "publisher", "roommate", "jogging", "mistress", "writer", "accountant", "mum", "attendant", "seaman", "monarch", "audience", "umpire", "brother", "typist", "scout", "assistant", "witch", "singer", "superhero", "troops", "pupil", "waiter", "troop", "pianist", "bodyguard", "biker", "choir", "surgeon", "men", "maid", "quarterback", "model", "officer", "mermaid", "teacher", "stepmother", "lecturer", "thief", "student", "reporter", "librarian", "nurse", "stylist", "realtor", "boy", "operator", "therapist", "platoon", "mother", "ladies", "men", "wife", "husband", "wives", "husbands", "nomad", "proprietor", "steward")
- # shortened words which may otherwise appear twice
- shortened <- c("rhino", "limo", "chimp", "scuba", "bike")
- # alternate versions of the same word, e.g. 'tomb' and 'tombstone' (plurals are okay in general, but this prevents the same word appearing as both singular and plural, e.g. 'spice' and 'spices')
- alternates <- c("spices", "tomb", "nostril", "needles", "chips", "levers", "motorbike", "peanuts", "trucks", "stairway", "meteor", "stair", "snails", "boots", "kitty", "piggy", "liqueur", "weed", "tummy", "ropes", "bumblebee")
- # words which I think have high concreteness because they are common misspellings of concrete words
- misspellings <- c("canon")
- # modal names for images that were clearly incorrectly named
- incorrects <- c("nut", "trumpet", "tuba", "spinach")
- # plural words (as all images are single objects)
- plurals <- c("sticks", "bees", "buttons", "strings", "ceramics", "tables", "arms", "molasses", "cereals", "mice", "brushes")
- # incongruent matches which have been excluded as they were possible descriptions for their image
- plausible <- c(
- "rubber", # was a match for "statue", which could also be a small rubber
- "buck", # was a match for a picture of a book (homophone)
- "marrow", # was a match for pickle
- "logo", # just about anything could be a logo
- "kit" # was a match for a baseball cap
- )
- stim <- boss %>%
- filter(
- (is_pic == "yes" & dupe_pref) | is_pic=="no",
- modal_name %in% swow$cue,
- PK.Brysbaert >= 0.9,
- PoS.SUBTLEX_UK == "noun",
- CNC.Brysbaert >= 4,
- !modal_name %in% c(americanisms, inappropriate, unimageable, people, shortened, alternates, misspellings, incorrects, plurals, plausible)
- ) %>%
- mutate(is_pic = as.factor(is_pic)) %>%
- rename(string = "modal_name") %>%
- split_by(is_pic, "yes" ~ "no") %>%
- # -0.5:0 (similarity of between 0.5 and 1) seems to give high similarity
- # -1:-0.99 (similarity of between 0 and 0.01) gives suitably dissimilar
- control_for_map(cos_sim, string, -1:-0.99, name = "cos_ppmi_sim") %>%
- control_for_map(maximise_lev_dist, string, 1:1, name = "max_lev_dist") %>%
- control_for(Length, 0:0) %>%
- control_for(CNC.Brysbaert, -0.25:0.25) %>%
- control_for(ON.OLD20, -0.75:0.75) %>%
- control_for(Zipf.SUBTLEX_UK, -0.125:0.125) %>%
- control_for(BG.SUBTLEX_UK, -0.0025:0.0025) %>%
- generate(200, seed = stim_seed)
- write_csv(stim, "stim.csv")
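- # optional sanity check of the generated design (not run; uses LexOPS's plot_design(), if available):
- # plot_design(stim)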
- stim_lev_dists <- stim %>%
- rowwise() %>%
- mutate(levenshtein_distance = stringdist(A1, A2, method="lv")) %>%
- ungroup() %>%
- select(item_nr, levenshtein_distance)
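- # reshape to one row per item pair: A1 (picture-congruent) and A2 (picture-incongruent) values side
- # by side, with the Levenshtein distance between the two words joined on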
- stim_tidy <- stim %>%
- long_format("all") %>%
- rename(perc_name_agree = "perc_name_agree_denom_fq_inputs") %>%
- pivot_wider(id_cols = item_nr, names_from = condition, values_from = c(string, filename, perc_name_agree, Zipf.SUBTLEX_UK, Length, cos_ppmi_sim, CNC.Brysbaert, BG.SUBTLEX_UK, nb_diff_names, ON.OLD20)) %>%
- select(item_nr, string_A1, filename_A1, perc_name_agree_A1, string_A2, everything(), -filename_A2, -perc_name_agree_A2) %>%
- left_join(stim_lev_dists, by="item_nr")
- # optimise split for counterbalancing -------------------------------------
- message("Optimising counterbalance split")
- vars_to_match <- c("perc_name_agree_A1", "Zipf.SUBTLEX_UK_A1", "Zipf.SUBTLEX_UK_A2", "cos_ppmi_sim_A1", "cos_ppmi_sim_A2", "BG.SUBTLEX_UK_A1", "BG.SUBTLEX_UK_A2", "CNC.Brysbaert_A1", "CNC.Brysbaert_A2", "ON.OLD20_A1", "ON.OLD20_A2", "Length_A1", "Length_A2")
- # how many seeds to try
- max_seed <- 50000
- cl <- makeCluster(n_cores)
- cl_packages <- clusterEvalQ(cl, {library(dplyr); library(purrr); library(overlapping)})
- clusterExport(cl, c("stim_tidy", "vars_to_match"))
- dist_overlaps <- parLapply(cl, 1:max_seed, function(seed_i) {
- cat(sprintf("%s \r", seed_i))
- set.seed(seed_i)
- order_grp_vec <- sample(rep(c(1, 2), 100), 200, replace = FALSE)
- stim_seed_i <- mutate(stim_tidy, order_grp = order_grp_vec)
- tibble(
- seed_i = seed_i,
- var_j = vars_to_match,
- ov = map_dbl(vars_to_match, function(var_j) {
- var_grp_1 <- stim_seed_i %>%
- filter(order_grp == 1) %>%
- pull(var_j) %>%
- as.numeric()
- var_grp_2 <- stim_seed_i %>%
- filter(order_grp == 2) %>%
- pull(var_j) %>%
- as.numeric()
- ov <- overlapping::overlap(list(var_grp_1, var_grp_2))
- ov$OV %>%
- unname() %>%
- as.numeric()
- })
- )
- }) %>%
- reduce(bind_rows)
- stopCluster(cl)
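- # note on interpreting ov: overlapping::overlap() returns values near 1 for near-identical
- # distributions, e.g. (not run) overlapping::overlap(list(rnorm(1e4), rnorm(1e4)))$OV is close to 1,
- # while overlapping::overlap(list(rnorm(1e4), rnorm(1e4, 3)))$OV is much smaller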
- dist_overlaps_summ <- dist_overlaps %>%
- group_by(seed_i) %>%
- summarise(sum = sum(ov), median = median(ov), mean = mean(ov), sd = sd(ov), min = min(ov)) %>%
- arrange(desc(mean), sd)
- # apply the split ---------------------------------------------------------
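- # re-draw the random assignment using the best seed (the one with the highest mean overlap across variables)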
- dist_overlaps_summ %>%
- pull(seed_i) %>%
- first() %>%
- set.seed()
- order_grp_vec <- sample(rep(c(1, 2), 100), 200, replace = FALSE)
- stim_tidy <- mutate(stim_tidy, order_grp = order_grp_vec)
- write_csv(stim_tidy, "stim_tidy.csv")
- stim_tidy_long <- stim %>%
- long_format("all") %>%
- arrange(item_nr) %>%
- mutate(
- filename = ifelse(is.na(filename), lag(filename), filename),
- cos_ppmi_sim_tidy = as.numeric(ifelse(cos_ppmi_sim=="1", NA, cos_ppmi_sim))
- ) %>%
- rename(perc_name_agree = "perc_name_agree_denom_fq_inputs") %>%
- # also store the order group
- full_join(select(stim_tidy, item_nr, order_grp), by = "item_nr")
- write_csv(stim_tidy_long, "stim_tidy_long.csv")
- # select practice trials --------------------------------------------------
- message("Selecting practice trials")
- prac_seed <- 111
- set.seed(prac_seed)
- A1_trials <- sample(1:20, 10)
- practice_stim <- boss %>%
- filter(
- (is_pic == "yes" & dupe_pref) | is_pic=="no",
- modal_name %in% swow$cue,
- PK.Brysbaert >= 0.9,
- PoS.SUBTLEX_UK == "noun",
- CNC.Brysbaert >= 4,
- !modal_name %in% c(americanisms, inappropriate, unimageable, people, shortened, alternates, misspellings, incorrects, plurals, plausible),
- !modal_name %in% stim_tidy_long$string
- ) %>%
- mutate(is_pic = as.factor(is_pic)) %>%
- rename(string = "modal_name") %>%
- split_by(is_pic, "yes" ~ "no") %>%
- # -0.5:0 (similarity of between 0.5 and 1) seems to give high similarity
- # -1:-0.99 (similarity of between 0 and 0.01) gives suitably dissimilar matches
- control_for_map(cos_sim, string, -1:-0.99, name = "cos_ppmi_sim") %>%
- control_for_map(maximise_lev_dist, string, 1:1, name = "max_lev_dist") %>%
- control_for(Length, 0:0) %>%
- generate(20, seed = prac_seed)
- practice_stim_long <- practice_stim %>%
- long_format("all") %>%
- mutate(filename = ifelse(is.na(filename), lag(filename), filename)) %>%
- # randomly allocate half the items to the congruent condition and half to the incongruent condition, keeping only the relevant trial for each item
- filter( (condition=="A1" & item_nr %in% A1_trials) | (condition=="A2" & !item_nr %in% A1_trials) )
- # save the practice trials
- write_csv(practice_stim_long, "practice_stim.csv")
- # tidy picture-word stimuli figures ---------------------------------------
- itemwise_vars_for_dens <- c("Length", "Zipf.SUBTLEX_UK", "BG.SUBTLEX_UK", "CNC.Brysbaert", "ON.OLD20")
- distwise_vars_for_dens <- c("perc_name_agree", "cos_ppmi_sim_tidy")
- # plt_cols <- c("#007EFF", "#FF007E")
- plt_cols <- c("#E69F00", "#56B4E9")
- counterbalanced_grps_label <- c("Set 1", "Set 2")
- fontsize <- 12
- # Format data
- plt_stim_itemwise <- stim_tidy_long %>%
- mutate(condition = sub("-", "\n", condition, fixed=TRUE)) %>%
- pivot_longer(cols=all_of(itemwise_vars_for_dens), names_to="plt_variable", values_to="plt_value") %>%
- mutate(condition_j = as.numeric(factor(condition))) %>%
- rowwise() %>%
- # add horizontal jitter to points and lines
- mutate(condition_j = ifelse(condition=="A1", condition_j + runif(1, 0.1, 0.15), condition_j + runif(1, -0.15, -0.1))) %>%
- # mutate(., condition_j = ifelse(condition=="A1", condition_j + 0.15, condition_j -0.15)) %>%
- ungroup() %>%
- mutate(
- condition = ifelse(condition=="A1", "Picture\nCongruent", "Picture\nIncongruent"),
- plt_variable = factor(plt_variable, levels = itemwise_vars_for_dens),
- plt_variable = fct_recode(
- plt_variable,
- `Length` = "Length",
- `Zipf Frequency` = "Zipf.SUBTLEX_UK",
- `Bigram Probability` = "BG.SUBTLEX_UK",
- `Concreteness` = "CNC.Brysbaert",
- `OLD20` = "ON.OLD20"
- )
- )
- itemwise_plt <- plt_stim_itemwise %>%
- ggplot(aes(condition, plt_value, colour=factor(order_grp), fill=factor(order_grp))) +
- geom_half_violin(data=filter(plt_stim_itemwise, condition=="Picture\nCongruent"), alpha=0.5, trim=FALSE, side="l", position=position_identity(), show.legend=FALSE) +
- geom_half_violin(data=filter(plt_stim_itemwise, condition=="Picture\nIncongruent"), alpha=0.5, trim=FALSE, side="r", position=position_identity(), show.legend=FALSE) +
- geom_point(aes(x=condition_j), alpha=0.75, shape=16, size=0.8) +
- geom_line(aes(x=condition_j, group=item_nr), alpha=0.3) +
- scale_fill_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols) +
- scale_colour_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols, guide = guide_legend(override.aes = list(alpha = 1, size=2, linewidth=1))) +
- scale_y_continuous(breaks = scales::pretty_breaks(5)) +
- facet_wrap(vars(plt_variable), scales="free_y") +
- labs(
- x = "\nCongruency Condition\n",
- y = "Value",
- tag = "a"
- ) +
- theme_classic() +
- theme(
- legend.position = c(0.85, 0.15),
- text = element_text(size=fontsize),
- # plot.title = element_text(hjust=-0.1),
- strip.background = element_blank(),
- legend.margin = margin(),
- plot.margin = margin(3,3,3,8, unit="pt")
- )
- plt_stim_distwise <- stim_tidy_long %>%
- mutate(condition = sub("-", "\n", condition, fixed=TRUE)) %>%
- pivot_longer(cols=all_of(distwise_vars_for_dens), names_to="plt_variable", values_to="plt_value") %>%
- mutate(
- plt_variable = factor(plt_variable, levels = distwise_vars_for_dens),
- plt_variable = fct_recode(
- plt_variable,
- `Percentage of Name Agreement` = "perc_name_agree",
- `Cosine PPMI Semantic Similarity` = "cos_ppmi_sim_tidy"
- ),
- plt_variable = fct_rev(plt_variable)
- )
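- # custom_breaks: use fewer x-axis breaks on the panel with very small values (the cosine similarity panel)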
- custom_breaks <- function(x) { if (max(x) < 0.01) pretty(x, 3) else pretty(x, 5) }
- distwise_plt <- plt_stim_distwise %>%
- ggplot(aes(plt_value, colour=factor(order_grp), fill=factor(order_grp))) +
- geom_density(alpha=0.5, show.legend = FALSE) +
- facet_wrap(vars(plt_variable), scales="free") +
- labs(
- x = "Value",
- y = "Density",
- tag = "b"
- ) +
- scale_fill_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols) +
- scale_colour_manual(name = "Stimulus Set", labels = counterbalanced_grps_label, values = plt_cols) +
- scale_x_continuous(breaks = custom_breaks) +
- scale_y_continuous(breaks = scales::pretty_breaks(3)) +
- theme_classic() +
- theme(
- legend.position = "none",
- text = element_text(size=fontsize),
- # plot.title = element_text(hjust=-0.1),
- strip.background = element_blank(),
- legend.margin = margin(),
- plot.margin = margin(3,3,3,8, unit="pt")
- )
- stim_summ <- (itemwise_plt / distwise_plt) +
- plot_layout(
- heights = c(5.5, 1), widths = 6.5,
- guides = "keep"
- )
- ggsave(file.path("fig", "stimuli_summary.png"), stim_summ, height=5.5, width=6.5, device="png", type="cairo")
- ggsave(file.path("fig", "stimuli_summary.pdf"), stim_summ, height=5.5, width=6.5, device="pdf")
- # select localiser stimuli ------------------------------------------------
- message("Selecting localiser stimuli")
- # misspelt words in the prevalence norms
- prev_misspellings <- c("yogurt")
- # words which might not technically be words
- nonwords <- c("yippee")
- max_seed <- 500000
- set.seed(stim_seed)
- loc_seeds <- sample(1:2147483647, max_seed, replace=FALSE)
- match_vars <- c("Zipf.SUBTLEX_UK", "PREV.Brysbaert", "CNC.Brysbaert", "AoA.Kuperman", "VAL.Warriner", "AROU.Warriner", "DOM.Warriner", "ON.OLD20", "BG.SUBTLEX_UK", "Length", "RT.BLP", "Accuracy.BLP")
- loc_stim_pool <- lexops %>%
- filter(
- PK.Brysbaert >= 0.9,
- !is.na(Zipf.SUBTLEX_UK),
- !string %in% c(prev_misspellings, nonwords),
- ) %>%
- mutate(PoS.SUBTLEX_UK = as.character(PoS.SUBTLEX_UK)) %>%
- select(string, all_of(match_vars), PoS.SUBTLEX_UK)
- pos_cats <- unique(loc_stim_pool$PoS.SUBTLEX_UK)
- pos_match_vars <- sapply(pos_cats, function(x) sprintf("pos_%s", x))
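- # build one 0/1 dummy column per part-of-speech category, e.g. if PoS.SUBTLEX_UK were
- # c("noun", "verb", "noun"), this would give pos_noun = c(1, 0, 1) and pos_verb = c(0, 1, 0)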
- pos_dum_vals <- map_dfc(pos_cats, function(x) {
- as.numeric(loc_stim_pool$PoS.SUBTLEX_UK == x)
- }) %>%
- set_names(pos_match_vars)
- loc_stim_pool <- bind_cols(loc_stim_pool, pos_dum_vals)
- cl <- makeCluster(n_cores)
- cl_packages <- clusterEvalQ(cl, {library(dplyr); library(purrr); library(overlapping)})
- clusterExport(cl, c("loc_stim_pool", "match_vars", "pos_match_vars", "stim_tidy_long", "practice_stim_long", "loc_seeds"))
- loc_seeds_res <- parLapply(cl, 1:max_seed, function(seed_i) {
- set.seed(loc_seeds[seed_i])
- loc_stim_i <- loc_stim_pool %>%
- filter(
- !string %in% stim_tidy_long$string,
- !string %in% practice_stim_long$string
- ) %>%
- slice_sample(n = 100)
- ov_numeric <- map_dbl(match_vars, function(var_j) {
- pool_vals <- loc_stim_pool[[var_j]][!is.na(loc_stim_pool[[var_j]])]
- sample_vals <- loc_stim_i[[var_j]][!is.na(loc_stim_i[[var_j]])]
- ov <- overlapping::overlap(list(pool_vals, sample_vals))
- ov$OV %>%
- unname() %>%
- as.numeric()
- })
- ov_pos <- map_dbl(pos_match_vars, function(var_j) {
- pool_vals <- loc_stim_pool[[var_j]][!is.na(loc_stim_pool[[var_j]])]
- sample_vals <- loc_stim_i[[var_j]][!is.na(loc_stim_i[[var_j]])]
- ov <- overlapping::overlap(list(pool_vals, sample_vals))
- ov$OV %>%
- unname() %>%
- as.numeric()
- })
- bind_rows(
- tibble(
- seed = loc_seeds[seed_i],
- ov = ov_numeric,
- vars = match_vars,
- weight = 1
- ),
- tibble(
- seed = loc_seeds[seed_i],
- ov = ov_pos,
- vars = pos_match_vars,
- weight = 1/8
- )
- ) %>%
- mutate(
- weight = case_when(
- vars == "Length" ~ 3,
- vars == "Zipf.SUBTLEX_UK" ~ 2,
- TRUE ~ weight
- ),
- ov_w = ov * weight
- ) %>%
- group_by(seed) %>%
- summarise(ov_sum = sum(ov_w))
- }) %>%
- reduce(bind_rows)
- stopCluster(cl)
- best_seed <- loc_seeds_res %>%
- arrange(desc(ov_sum)) %>%
- slice_max(ov_sum, n = 1, with_ties = FALSE) %>%
- pull(seed)
- # recreate the best stimulus set
- set.seed(best_seed)
- loc_stim <- loc_stim_pool %>%
- filter(
- !string %in% stim_tidy_long$string,
- !string %in% practice_stim_long$string
- ) %>%
- slice_sample(n = 100)
- loc_stim_tidy <- loc_stim %>%
- rename(word=string) %>%
- mutate(
- item_nr = row_number()
- ) %>%
- select(item_nr, everything())
- write_csv(loc_stim_tidy, "localiser_stim.csv")
- # plot the distributions for localiser stim -------------------------------
- repres_cols <- c("#696867", "#5ba7d6")
- dens_pl <- bind_rows(
- mutate(loc_stim, type="Sample"),
- mutate(loc_stim_pool, type="Population")
- ) %>%
- pivot_longer(cols=all_of(match_vars[match_vars!="Length"]), names_to="var", values_to="value") %>%
- mutate(var_tidy = recode(
- var,
- Zipf.SUBTLEX_UK = "Zipf Frequency",
- BG.SUBTLEX_UK = "Bigram Probability",
- CNC.Brysbaert = "Concreteness",
- ON.OLD20 = "OLD20",
- VAL.Warriner = "Valence",
- AROU.Warriner = "Arousal",
- DOM.Warriner = "Dominance",
- AoA.Kuperman = "Age of Acquisition",
- PREV.Brysbaert = "Prevalence",
- RT.BLP = "Lexical Decision RT",
- Accuracy.BLP = "Lexical Decision Accuracy"
- )) %>%
- mutate(var_tidy = factor(var_tidy, levels=sort(unique(var_tidy)))) %>%
- ggplot(aes(value, fill=type, alpha=type)) +
- geom_density() +
- facet_wrap(vars(var_tidy), scales="free", nrow=4) +
- labs(
- x = "Value",
- y = "Density",
- tag = "a",
- fill = NULL
- ) +
- scale_fill_manual(values=repres_cols) +
- scale_alpha_manual(values=c(1, 0.5), guide="none") +
- scale_x_continuous(guide = guide_axis(check.overlap = TRUE)) +
- theme_classic() +
- theme(
- legend.position = c(0.85, 0.1),
- axis.text.y=element_blank(),
- axis.ticks.y=element_blank(),
- strip.background = element_blank(),
- legend.margin = margin(),
- plot.margin = margin(3,3,3,3, unit="pt")
- )
- dat_length <- bind_rows(
- mutate(loc_stim, type="Sample"),
- mutate(loc_stim_pool, type="Population")
- ) %>%
- mutate(var="Length", value=as.numeric(Length))
- dat_pos <- bind_rows(
- loc_stim %>% mutate(type="Sample", max_n=nrow(.)),
- loc_stim_pool %>% mutate(type="Population", max_n=nrow(.))
- ) %>%
- mutate(var="Part of Speech", value=PoS.SUBTLEX_UK) %>%
- group_by(type, value, var) %>%
- summarise(prop = n() / first(max_n), .groups = "drop") %>%
- distinct() %>%
- filter(value %in% loc_stim$PoS.SUBTLEX_UK)
- length_pl <- dat_length %>%
- ggplot(aes(value)) +
- geom_histogram(aes(y = after_stat(count / sum(count))), position = "identity", binwidth=1, fill=repres_cols[[1]], alpha=1, data=filter(dat_length, type=="Population")) +
- geom_histogram(aes(y = after_stat(count / sum(count))), position = "identity", binwidth=1, fill=repres_cols[[2]], alpha=0.5, data=filter(dat_length, type=="Sample")) +
- facet_wrap(vars(var), scales="free") +
- labs(
- x = "",
- y = "Proportion",
- tag = "b"
- ) +
- theme_classic() +
- theme(
- legend.position = "none",
- legend.margin = margin(),
- plot.margin = margin(3,3,3,3, unit="pt"),
- strip.background = element_blank()
- )
- pos_pl <- dat_pos %>%
- ggplot(aes(value, prop)) +
- geom_col(position = "identity", fill=repres_cols[[1]], alpha=1, data=filter(dat_pos, type=="Population")) +
- geom_col(position = "identity", fill=repres_cols[[2]], alpha=0.5, data=filter(dat_pos, type=="Sample")) +
- facet_wrap(vars(var), scales="free") +
- labs(
- x = "",
- y = NULL
- ) +
- theme_classic() +
- theme(
- legend.position = "none",
- legend.margin = margin(),
- plot.margin = margin(3,3,3,3, unit="pt"),
- strip.background = element_blank()
- )
- # loc_stim_summ <- (dens_pl / (length_pl | pos_pl + xlab("Value") + theme(axis.title.x = element_text(hjust=-0.175)))) +
- # plot_layout(heights = c(5.8, 1.2), widths = c(6.5, 2.5, 4))
- loc_stim_summ <- dens_pl + length_pl + (pos_pl + xlab("Value") + theme(axis.title.x = element_text(hjust=0.18))) +
- plot_layout(
- heights = c(6, 1),
- design = "
- AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
- BBBBBBBBBBCCCCCCCCCCCCCCCCCCCC"
- )
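- # in the design string, A spans the density grid, B the length histogram, and C the part-of-speech bars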
- # loc_stim_summ
- ggsave(file.path("fig", "localiser_stimuli_summary.png"), loc_stim_summ, height=5.5, width=6.5, device="png", type="cairo")
- ggsave(file.path("fig", "localiser_stimuli_summary.pdf"), loc_stim_summ, height=5.5, width=6.5, device="pdf")
- message("Stimuli done!")
|