# Load the labelled channel table and the video table as plain data frames.
channel <- as.data.frame(read_csv("gun_channels_labeled.csv"))
video <- as.data.frame(read_csv("gun_videos.csv"))

# Read every file matched by the transcript glob.
docs <- "gun_transcripts/*"
transcript <- readtext(docs)
# Turn the transcripts into a corpus.
corp_tran <- corpus(transcript)

# Document names minus the ".txt" extension serve as the video ids.
video.ids <- gsub(
  pattern = "\\.txt",
  replacement = "",
  x = docnames(corp_tran)
)

# Attach the video table's columns to each id, matching on `rec.video.id`.
video.data <- data.frame(video.id = video.ids) %>%
  left_join(video, by = c("video.id" = "rec.video.id"))
# summary(corp_tran, 10)  # take a look at the corpus

# Preferred preprocessing method: PNLSWI.
# Steps built in this section: P = punctuation removed, N = numbers removed,
# L = lowercased, S = stemmed. The W and I steps are not applied here.
# The trailing numbers are the counts noted by the original author
# (presumably the feature counts reported by the verbose output).

# Tokenise at increasing levels of cleaning.
toke_tran <- tokens(corp_tran, verbose = TRUE) # 77215
toke_tran_P <- tokens(corp_tran, remove_punct = TRUE, verbose = TRUE) # 77166
toke_tran_PN <- tokens(
  corp_tran,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  verbose = TRUE
) # 74664

# Case-preserving document-feature matrices for each tokenisation.
dfm_tran_raw <- dfm(toke_tran, tolower = FALSE, verbose = TRUE) # 77211?
dfm_tran_P <- dfm(toke_tran_P, tolower = FALSE, verbose = TRUE) # 77163
dfm_tran_PN <- dfm(toke_tran_PN, tolower = FALSE, verbose = TRUE) # 74661

# Lowercase and stem with the dedicated dfm_*() helpers rather than by
# re-running dfm() on a dfm: the `stem` argument of dfm() is deprecated in
# recent quanteda releases, while these helpers apply the same transformations.
dfm_tran_PNL <- dfm_tolower(dfm_tran_PN) # 56582
dfm_tran_PNLS <- dfm_wordstem(dfm_tran_PNL) # 37175