this repo has no description

Merge branch 'main' of github.com:sona-tau/underrepresentation-theory

+345 -38
code/All Tags.jpeg

This is a binary file and will not be displayed.

code/MC_Tag_df.rda

This is a binary file and will not be displayed.

code/MS_Tag_df.rda

This is a binary file and will not be displayed.

code/MultiSelect.jpeg

This is a binary file and will not be displayed.

+345 -38
code/Survey Question Conversion and Analysis.R
··· 7 7 install.packages("dplyr") 8 8 install.packages("SnowballC") 9 9 install.packages("tibble") 10 + install.packages("tidyr") 11 + install.packages("patchwork") 12 + library(patchwork) 10 13 library(ggplot2) 11 14 library(tidyverse) 12 15 library(tidytext) ··· 15 18 library(dplyr) 16 19 library(SnowballC) 17 20 library(tibble) 21 + library(tidyr) 18 22 py2r <- function(df, rows) { 19 23 for (row in rows) { 20 24 # In this case we use reticulate::py_eval to convert a python string ··· 197 201 ams_ms_tagged <- as.data.frame(AMS_multipleselect_tags) 198 202 199 203 AMS_Question_Tags <- data.frame( 200 - questions = AMS_questions_list, 201 - tags = tagged_AMS_df 204 + Questions = AMS_questions_list, 205 + Tags = tagged_AMS_df 202 206 ) 203 207 208 + colnames(AMS_Question_Tags) <- list("Questions", "Tags") 209 + 204 210 AMS_MC_Tags <- data.frame( 205 - questions = AMS_MC, 206 - tags = ams_mc_tagged 211 + Questions = AMS_MC, 212 + Tags = ams_mc_tagged 207 213 ) 214 + 215 + colnames(AMS_MC_Tags) <- list("Questions", "Tags") 208 216 209 217 AMS_MS_Tags <- data.frame( 210 - questions = AMS_MS, 211 - tags = ams_ms_tagged 218 + Questions = AMS_MS, 219 + Tags = ams_ms_tagged 212 220 ) 213 221 222 + colnames(AMS_MS_Tags) <- list("Questions", "Tags") 223 + 214 224 AMS_TI_Tags <- data.frame( 215 - questions = AMS_TI, 216 - tags = ams_ti_tagged 225 + Questions = AMS_TI, 226 + Tags = ams_ti_tagged 217 227 ) 218 228 229 + colnames(AMS_TI_Tags) <- list("Questions", "Tags") 230 + 219 231 AMS_WR_Tags <- data.frame ( 220 - questions = AMS_WR, 221 - tags = ams_wr_tagged 232 + Questions = AMS_WR, 233 + Tags = ams_wr_tagged 222 234 ) 223 235 236 + colnames(AMS_WR_Tags) <- list("Questions", "Tags") 224 237 225 238 CBMS_all_questions_char <- as.character(CBMS_all_questions) 226 239 ··· 287 300 tags = tagged_CBMS_df 288 301 ) 289 302 303 + colnames(CBMS_Question_Tags) <- list("Questions", "Tags") 304 + 290 305 CBMS_MC_Tags <- data.frame( 291 - questions = CBMSMC_full, 292 - tags = CBMS_mc_tagged 306 + Questions = CBMSMC_full, 307 + Tags = CBMS_mc_tagged 293 308 ) 294 309 all_multiselect_CBMS <- as.data.frame(CBMS_all_multipleselect) 295 310 296 311 class(CBMS_all_multipleselect) 297 312 298 313 CBMS_MS_Tags <- data.frame( 299 - questions = CBMS_all_multipleselect, 300 - tags = tagged_cbms_ms$CBMS_ms_tagged 314 + Questions = CBMS_all_multipleselect, 315 + Tags = tagged_cbms_ms$CBMS_ms_tagged 301 316 ) 302 317 303 318 rownames(CBMS_MS_Tags) <- NULL 304 319 305 320 CBMS_TI_Tags <- data.frame( 306 - questions = CBMS_all_tableinput, 307 - tags = tagged_cbms_ti$CBMS_ti_tagged 321 + Questions = CBMS_all_tableinput, 322 + Tags = tagged_cbms_ti$CBMS_ti_tagged 308 323 ) 309 324 310 325 rownames(CBMS_TI_Tags) <- NULL 311 326 312 327 CBMS_WR_Tags <- data.frame( 313 - questions = CBMS_all_writtenresponse, 314 - tags = tagged_cbms_wr$CBMS_wr_tagged 328 + Questions = CBMS_all_writtenresponse, 329 + Tags = tagged_cbms_wr$CBMS_wr_tagged 315 330 ) 316 331 317 - rownames(CBMS_TI_Tags) <- NULL 332 + rownames(CBMS_WR_Tags) <- NULL 318 333 319 334 tagged_IPEDS <- sapply(IPEDS_all_questions, function(text) { 320 335 matches <- str_extract_all(text, paste(keywords, collapse = "|"))[[1]] ··· 328 343 tagged_IPEDS_df <- as.data.frame(tagged_IPEDS) 329 344 330 345 346 + 331 347 #there were no tagged questions in the IPEDS survey, so I assigned each question type a value of zero. 332 - IPEDS_multiplechoice_tagged = 0 333 - IPEDS_multipleslect_tagged = 0 334 - IPEDS_tableinput_tagged = 0 335 - IPEDS_writtenresponse_tagged = 0 348 + IPEDS_multiplechoice_tagged <- sapply(IPEDS_MC, function(text) { 349 + matches <- str_extract_all(text, paste(keywords, collapse = "|"))[[1]] 350 + if (length(matches) > 0) { 351 + paste("Tags:", paste(unique(matches), collapse = ", ")) 352 + } else { 353 + NA 354 + } 355 + }) 336 356 337 - tagged_AMS_count <- tagged_AMS_df[complete.cases(tagged_AMS_df),] |> length() 338 - tagged_ams_mc_count <- ams_mc_tagged[complete.cases(ams_mc_tagged),] |> length() 339 - tagged_ams_ms_count <- ams_ms_tagged[complete.cases(ams_ms_tagged),] |> length() 340 - tagged_ams_ti_count <- ams_ti_tagged[complete.cases(ams_ti_tagged),] |> length() 341 - tagged_ams_wr_count <- ams_wr_tagged[complete.cases(ams_wr_tagged),] |> length() 357 + tagged_IPEDS_MC <- as.data.frame(IPEDS_multiplechoice_tagged) 358 + 359 + IPEDS_multipleslect_tagged <- sapply(IPEDS_MS, function(text) { 360 + matches <- str_extract_all(text, paste(keywords, collapse = "|"))[[1]] 361 + if (length(matches) > 0) { 362 + paste("Tags:", paste(unique(matches), collapse = ", ")) 363 + } else { 364 + NA 365 + } 366 + }) 367 + 368 + tagged_IPEDS_MS <- as.data.frame(IPEDS_multipleslect_tagged) 369 + rownames(tagged_IPEDS_MS) <- NULL 370 + 371 + 372 + IPEDS_tableinput_tagged <- sapply(IPEDS_TI, function(text) { 373 + matches <- str_extract_all(text, paste(keywords, collapse = "|"))[[1]] 374 + if (length(matches) > 0) { 375 + paste("Tags:", paste(unique(matches), collapse = ", ")) 376 + } else { 377 + NA 378 + } 379 + }) 380 + 381 + tagged_IPEDS_TI <- as.data.frame(IPEDS_tableinput_tagged) 382 + 383 + IPEDS_writtenresponse_tagged <- sapply(IPEDS_WR, function(text) { 384 + matches <- str_extract_all(text, paste(keywords, collapse = "|"))[[1]] 385 + if (length(matches) > 0) { 386 + paste("Tags:", paste(unique(matches), collapse = ", ")) 387 + } else { 388 + NA 389 + } 390 + }) 391 + 392 + tagged_IPEDS_WR <- as.data.frame(IPEDS_writtenresponse_tagged) 393 + 394 + IPEDS_Question_Tags <- data.frame( 395 + Questions = IPEDS_all_questions, 396 + Tags = tagged_IPEDS_df$tagged_IPEDS 397 + ) 398 + 399 + rownames(IPEDS_Question_Tags) <- NULL 400 + 401 + IPEDS_MC_Tags <- data.frame( 402 + Questions = IPEDS_MC, 403 + Tags = tagged_IPEDS_MC$IPEDS_multiplechoice_tagged 404 + ) 405 + 406 + rownames(IPEDS_MC_Tags) <- NULL 407 + 408 + IPEDS_MS_Tags <- data.frame( 409 + Questions = IPEDS_MS, 410 + Tags = tagged_IPEDS_MS$IPEDS_multipleslect_tagged 411 + ) 412 + 413 + rownames(IPEDS_MS_Tags) <- NULL 414 + 415 + IPEDS_TI_Tags <- data.frame( 416 + Questions = IPEDS_TI, 417 + Tags = tagged_IPEDS_TI$IPEDS_tableinput_tagged 418 + ) 419 + 420 + rownames(IPEDS_TI_Tags) <- NULL 421 + 422 + IPEDS_WR_Tags <- data.frame( 423 + Questions = IPEDS_WR, 424 + Tags = tagged_IPEDS_WR$IPEDS_writtenresponse_tagged 425 + ) 426 + 427 + rownames(IPEDS_WR_Tags) <- NULL 428 + 429 + AMS_Question_Tags$Source <- "AMS" 430 + CBMS_Question_Tags$Source <- "CBMS" 431 + IPEDS_Question_Tags$Source <- "IPEDS" 432 + 433 + big_question_tag_df <- rbind(AMS_Question_Tags, CBMS_Question_Tags, IPEDS_Question_Tags) 434 + 435 + #Counting number of each question in big_question_tag_df 436 + #Counting AMS 437 + AMS_Sources_Counts<- big_question_tag_df|> 438 + filter(Source == "AMS", !is.na(Tags))|> 439 + nrow() 440 + AMS_Sources_Counts 441 + 442 + #Counting CMS 443 + CBMS_Sources_Counts<- big_question_tag_df|> 444 + filter(Source == "CBMS", !is.na(Tags)) |> 445 + nrow() 446 + CBMS_Sources_Counts 447 + 448 + #Counting IPEDS 449 + IPEDS_Sources_Counts<-big_question_tag_df|> 450 + filter (Source == "IPEDS", !is.na(Tags)) |> 451 + nrow() 452 + IPEDS_Sources_Counts 453 + 454 + Counts_of_Tagged_Sources <- data.frame( 455 + AMS = AMS_Sources_Counts, 456 + IPEDS = IPEDS_Sources_Counts, 457 + CBMS = CBMS_Sources_Counts) 458 + 459 + Counts_of_Tagged_Sources_long <- pivot_longer( 460 + as_tibble(Counts_of_Tagged_Sources), 461 + cols = everything(), 462 + names_to = "Source", 463 + values_to = "Number Tagged" 464 + ) 465 + 466 + All_Tagged_Graph <- ggplot(Counts_of_Tagged_Sources_long, aes(x = Source, y = `Number Tagged`, fill = Source)) + 467 + geom_col() + 468 + scale_y_continuous(breaks = seq(0, max(Counts_of_Tagged_Sources_long$`Number Tagged`), by = 1)) + 469 + labs(y = NULL, 470 + x = NULL, 471 + title = "ALL") 472 + 473 + All_Tagged_Graph 474 + 475 + AMS_MC_Tags$Source = "AMS" 476 + CBMS_MC_Tags$Source = "CBMS" 477 + IPEDS_MC_Tags$Source = "IPEDS" 342 478 343 - tagged_CBMS_count <- tagged_CBMS_df[complete.cases(tagged_CBMS_df),] |> length() 344 - tagged_cbms_mc_count <- tagged_cbms_mc[complete.cases(tagged_cbms_mc),] |> length() 345 - tagged_cbms_ms_count <- tagged_cbms_ms[complete.cases(tagged_cbms_ms),] |> length() 346 - tagged_cbms_wr_count <-tagged_cbms_wr[complete.cases(tagged_cbms_wr),] |> length() 347 - tagged_cbms_ti <- tagged_cbms_ti[complete.cases(tagged_cbms_ti),] |> length() 479 + MC_Tag_df <- rbind(AMS_MC_Tags, CBMS_MC_Tags, IPEDS_MC_Tags) 348 480 349 481 350 - tagged_IPEDS_count <- tagged_IPEDS_df[complete.cases(tagged_IPEDS_df),] |> length() 482 + AMS_MC_Sources_Counts<- MC_Tag_df|> 483 + filter(Source == "AMS", !is.na(Tags))|> 484 + nrow() 485 + AMS_MC_Sources_Counts 351 486 352 - ggplot 487 + #Counting CMS 488 + CBMS_MC_Sources_Counts<- MC_Tag_df|> 489 + filter(Source == "CBMS", !is.na(Tags)) |> 490 + nrow() 491 + CBMS_MC_Sources_Counts 353 492 493 + #Counting IPEDS 494 + IPEDS_MC_Sources_Counts<- MC_Tag_df|> 495 + filter (Source == "IPEDS", !is.na(Tags)) |> 496 + nrow() 497 + IPEDS_MC_Sources_Counts 354 498 499 + Counts_of_Tagged_Sources_MC <- data.frame( 500 + AMS = AMS_MC_Sources_Counts, 501 + IPEDS = IPEDS_MC_Sources_Counts, 502 + CBMS = CBMS_MC_Sources_Counts) 355 503 504 + Counts_of_Tagged_Sources_MC_long <- pivot_longer( 505 + as_tibble(Counts_of_Tagged_Sources_MC), 506 + cols = everything(), 507 + names_to = "Source", 508 + values_to = "Number Tagged" 509 + ) 356 510 511 + MC_Tagged_Graph <- ggplot(Counts_of_Tagged_Sources_MC_long, aes(x = Source, y = `Number Tagged`, fill = Source)) + 512 + geom_col() + 513 + scale_y_continuous(breaks = seq(0, max(Counts_of_Tagged_Sources_long$`Number Tagged`), by = 1)) + 514 + labs(y = NULL, 515 + x = NULL, 516 + title = "Multiple Choice") 357 517 358 - 359 - 518 + MC_Tagged_Graph 519 + 520 + 521 + AMS_MS_Tags$Source = "AMS" 522 + CBMS_MS_Tags$Source = "CBMS" 523 + IPEDS_MS_Tags$Source = "IPEDS" 524 + 525 + MS_Tag_df <- rbind(AMS_MS_Tags, CBMS_MS_Tags, IPEDS_MS_Tags) 526 + 527 + 528 + AMS_MS_Sources_Counts<- MS_Tag_df|> 529 + filter(Source == "AMS", !is.na(Tags))|> 530 + nrow() 531 + AMS_MS_Sources_Counts 532 + 533 + #Counting CMS 534 + CBMS_MS_Sources_Counts<- MS_Tag_df|> 535 + filter(Source == "CBMS", !is.na(Tags)) |> 536 + nrow() 537 + CBMS_MS_Sources_Counts 538 + 539 + #Counting IPEDS 540 + IPEDS_MS_Sources_Counts<- MS_Tag_df|> 541 + filter (Source == "IPEDS", !is.na(Tags)) |> 542 + nrow() 543 + IPEDS_MS_Sources_Counts 544 + 545 + Counts_of_Tagged_Sources_MS <- data.frame( 546 + AMS = AMS_MS_Sources_Counts, 547 + IPEDS = IPEDS_MS_Sources_Counts, 548 + CBMS = CBMS_MS_Sources_Counts) 549 + 550 + Counts_of_Tagged_Sources_MS_long <- pivot_longer( 551 + as_tibble(Counts_of_Tagged_Sources_MS), 552 + cols = everything(), 553 + names_to = "Source", 554 + values_to = "Number Tagged" 555 + ) 556 + 557 + MS_Tagged_Graph <- ggplot(Counts_of_Tagged_Sources_MS_long, aes(x = Source, y = `Number Tagged`, fill = Source)) + 558 + geom_col() + 559 + scale_y_continuous(breaks = seq(0, max(Counts_of_Tagged_Sources_long$`Number Tagged`), by = 1)) + 560 + labs(y = NULL, 561 + x = NULL, 562 + title = "Multiple Select") 563 + 564 + MS_Tagged_Graph 565 + 566 + 567 + AMS_TI_Tags$Source = "AMS" 568 + CBMS_TI_Tags$Source = "CBMS" 569 + IPEDS_TI_Tags$Source = "IPEDS" 570 + 571 + TI_Tag_df <- rbind(AMS_TI_Tags, CBMS_TI_Tags, IPEDS_TI_Tags) 572 + 573 + 574 + AMS_TI_Sources_Counts<- TI_Tag_df|> 575 + filter(Source == "AMS", !is.na(Tags))|> 576 + nrow() 577 + AMS_TI_Sources_Counts 578 + 579 + #Counting CMS 580 + CBMS_TI_Sources_Counts<- TI_Tag_df|> 581 + filter(Source == "CBMS", !is.na(Tags)) |> 582 + nrow() 583 + CBMS_TI_Sources_Counts 584 + 585 + #Counting IPEDS 586 + IPEDS_TI_Sources_Counts<- TI_Tag_df|> 587 + filter (Source == "IPEDS", !is.na(Tags)) |> 588 + nrow() 589 + IPEDS_TI_Sources_Counts 590 + 591 + Counts_of_Tagged_Sources_TI <- data.frame( 592 + AMS = AMS_TI_Sources_Counts, 593 + IPEDS = IPEDS_TI_Sources_Counts, 594 + CBMS = CBMS_TI_Sources_Counts) 595 + 596 + Counts_of_Tagged_Sources_TI_long <- pivot_longer( 597 + as_tibble(Counts_of_Tagged_Sources_TI), 598 + cols = everything(), 599 + names_to = "Source", 600 + values_to = "Number Tagged" 601 + ) 602 + 603 + TI_Tagged_Graph <- ggplot(Counts_of_Tagged_Sources_TI_long, aes(x = Source, y = `Number Tagged`, fill = Source)) + 604 + geom_col() + 605 + scale_y_continuous(breaks = seq(0, max(Counts_of_Tagged_Sources_long$`Number Tagged`), by = 1)) + 606 + labs(y = NULL, 607 + x = NULL, 608 + title = "Table Input") 609 + 610 + TI_Tagged_Graph 611 + 612 + AMS_WR_Tags$Source = "AMS" 613 + CBMS_WR_Tags$Source = "CBMS" 614 + IPEDS_WR_Tags$Source = "IPEDS" 615 + 616 + WR_Tag_df <- rbind(AMS_WR_Tags, CBMS_WR_Tags, IPEDS_WR_Tags) 617 + 618 + 619 + AMS_WR_Sources_Counts<- WR_Tag_df|> 620 + filter(Source == "AMS", !is.na(Tags))|> 621 + nrow() 622 + AMS_WR_Sources_Counts 623 + 624 + #Counting CMS 625 + CBMS_WR_Sources_Counts<- WR_Tag_df|> 626 + filter(Source == "CBMS", !is.na(Tags)) |> 627 + nrow() 628 + CBMS_WR_Sources_Counts 629 + 630 + #Counting IPEDS 631 + IPEDS_WR_Sources_Counts<- WR_Tag_df|> 632 + filter (Source == "IPEDS", !is.na(Tags)) |> 633 + nrow() 634 + IPEDS_WR_Sources_Counts 635 + 636 + Counts_of_Tagged_Sources_WR <- data.frame( 637 + AMS = AMS_WR_Sources_Counts, 638 + IPEDS = IPEDS_WR_Sources_Counts, 639 + CBMS = CBMS_WR_Sources_Counts) 640 + 641 + Counts_of_Tagged_Sources_WR_long <- pivot_longer( 642 + as_tibble(Counts_of_Tagged_Sources_WR), 643 + cols = everything(), 644 + names_to = "Source", 645 + values_to = "Number Tagged" 646 + ) 647 + 648 + WR_Tagged_Graph <- ggplot(Counts_of_Tagged_Sources_WR_long, aes(x = Source, y = `Number Tagged`, fill = Source)) + 649 + geom_col() + 650 + scale_y_continuous(breaks = seq(0, max(Counts_of_Tagged_Sources_long$`Number Tagged`), by = 1)) + 651 + labs(y = NULL, 652 + x = NULL, 653 + title = "Written Response") 654 + WR_Tagged_Graph 655 + 656 + Big_Graph <- All_Tagged_Graph + 657 + MC_Tagged_Graph + 658 + MS_Tagged_Graph + 659 + TI_Tagged_Graph + 660 + WR_Tagged_Graph + 661 + plot_layout(nrow = 1) 360 662 361 - 663 + Big_Graph 362 664 665 + save(big_question_tag_df, file = "big_question_tag_df.rda") 666 + save(MC_Tag_df, file = "MC_Tag_df.rda") 667 + save(MS_Tag_df, file = "MS_Tag_df.rda") 668 + save(TI_Tag_df, file = "TI_Tag_df.rda") 669 + save(WR_Tag_df, file = "WR_Tag_df.rda") 363 670 364 671 365 672
code/TI_Tag_df.rda

This is a binary file and will not be displayed.

code/WR_Tag_df.rda

This is a binary file and will not be displayed.

code/big_question_tag_df.rda

This is a binary file and will not be displayed.

code/interactive_wordcloud.R

This is a binary file and will not be displayed.