#Problem 1
In this problem set, we will conduct supervised classification and unsupervised dimension reduction using a corpus of YouTube video transcripts about guns and gun control. The cleaned transcripts and associated metadata are available in gun transcripts.zip and gun videos.csv, respectively. Key variable definitions in the metadata are as follows:
Problem 1 is intended to be short. Begin by loading the data and constructing a document-term matrix (DTM). You may preprocess the data in any way that you see fit.
Load the data and construct a DTM
Load the data
channel = read_csv("gun_channels_labeled.csv") %>% as.data.frame
video = read_csv("gun_videos.csv") %>% as.data.frame
docs = "gun_transcripts/*"
transcript = readtext(docs)
Turn to DTM, use P-N-L-S-W-I, remove terms that appear in less than 0.5% of documents
#turn to corpus
corp_tran = corpus(transcript)
video.ids = gsub("\\.txt", "", docnames(corp_tran))
video.data = data.frame(video.id = video.ids)
video.data = left_join(video.data, video, by = c("video.id" = "rec.video.id"))
#summary(corp_tran, 10) #take a look at the corpus
#preferred preprocessing method: PNLSWI
toke_tran = tokens(corp_tran, verbose = TRUE) #77215
toke_tran_P = tokens(corp_tran, remove_punct = TRUE, verbose = TRUE) # 77166
toke_tran_PN = tokens(corp_tran, remove_punct = TRUE, remove_numbers = TRUE, verbose = TRUE) #74664
dfm_tran_raw = dfm(toke_tran, tolower = FALSE, verbose = TRUE) #77211?
dfm_tran_P = dfm(toke_tran_P, tolower = FALSE, verbose = TRUE) #77163
dfm_tran_PN = dfm(toke_tran_PN, tolower = FALSE, verbose = TRUE) #74661
dfm_tran_PNL = dfm(dfm_tran_PN, tolower = TRUE, verbose = TRUE) #56582
dfm_tran_PNLS = dfm(dfm_tran_PNL, tolower = FALSE, stem = TRUE, verbose = TRUE) #37175
## Warning: 'stem' is deprecated; use dfm_wordstem() instead
dfm_tran_PNLSW = dfm(dfm_tran_PNLS, tolower = FALSE, remove = stopwords("en"), verbose = TRUE) #37043
## Warning: 'remove' is deprecated; use dfm_remove() instead
dfm_tran_PNLSWI = dfm_trim(dfm_tran_PNLSW, min_docfreq = 0.5/100*ndoc(corp_tran), verbose = TRUE) #31328 removed, remove terms appear in less than 0.5% of documents
dfm_tran = dfm_tran_PNLSWI #will use this one for later questions
#word.perdoc = rowSums(dfm_tran)
#doc.perword = colSums(dfm_tran)
Convert the frequency count to a proportion within documents
dfm_tran_prop = dfm_weight(dfm_tran, scheme = "prop")
#dfm_tran_prop[1:5, 1:10] #take a look
Alternative preprocessing decisions: remove terms that appear in less than 1% of documents
#alternative preprocessing method: PNLSWI, remove 1% instead of 0.5%
dfm_tran_alt = dfm_trim(dfm_tran_PNLSW, min_docfreq = 1/100*ndoc(corp_tran),
verbose = TRUE) #33133 removed
dfm_tran_alt_prop = dfm_weight(dfm_tran_alt, scheme="prop")
Briefly discuss the difference
Compared with the main preprocessing decisions, the “alternative” regime
has fewer terms because the threshold of removing infrequent words is
higher.
#Problem 2
Setup
x = dfm_tran_prop #the DTM to be used
y = ifelse(video.data$label == "pro-gun", 1,
ifelse(video.data$label == "anti-gun", 0, NA)) #drop undetermined
obj.use = !is.na(y) #only labelled
x = dfm_tran[obj.use,] #only use labelled videos
y = y[obj.use]
fold = video.data[obj.use,"fold"] #fold indicator
Function to compute true positive rate, true negative rate, and accuracy
evaluation = function(ypred_y){
ypred = ypred_y[,1] %>% factor(levels=0:1)
yactual = ypred_y[,2] %>% factor(levels=0:1)
tab = table(ypred, yactual)
accuracy = sum(diag(tab)) / sum(tab)
confusion = table(ypred, yactual)
tpr = confusion[2,2] / sum(confusion[,2])
tnr = confusion[1,1] / sum(confusion[,1])
eva.result = c(tpr, tnr, accuracy)
names(eva.result) = c("True Positive Rate", "True Negative Rate", "Accuracy")
return(eva.result)
}
Algorithm 1: LASSO, \(\lambda \in \{0.01, 1, 10\}\)
#Define a function to run LASSO and returns predicted value + actual value
run.lasso = function(fold.test, lam, sample){
train = which(fold!=fold.test)
test = which(fold==fold.test)
lasso = glmnet(x[train,], y[train],
family="binomial", alpha=1, lambda=lam)
if (sample == "in"){
yactual = y[train]
preds = predict(lasso, x[train,], type="class")
} else if (sample == "out"){
yactual = y[test]
preds = predict(lasso, x[test,], type="class")
}
return(cbind(preds, yactual))
}
#Run Lasso
eval.lasso.k4 = NA
for (l in c(0.001, 1, 100)){
result = run.lasso(fold.test=4, lam=l, sample="in")
eval = evaluation(result)
eval.lasso.k4 = rbind(eval.lasso.k4, eval)
}
for (l in c(0.001, 1, 100)){
result = run.lasso(fold.test=4, lam=l, sample="out")
eval = evaluation(result)
eval.lasso.k4 = rbind(eval.lasso.k4, eval)
}
eval.lasso.k4 = eval.lasso.k4[-1,] %>% as.data.frame
eval.lasso.k4$Algorithm = "LASSO"
eval.lasso.k4$Sample = c(rep("In-sample",3), rep("Out-of-sample", 3))
eval.lasso.k4$Lambda = rep(c(0.001, 1, 100),2)
eval.lasso.k4 = eval.lasso.k4[,c(4:6, 1:3)]
rownames(eval.lasso.k4) = 1:6
The average performance using LASSO as classifier and fold 4 as test set is:
Algorithm | Sample | Lambda | True Positive Rate | True Negative Rate | Accuracy |
---|---|---|---|---|---|
LASSO | In-sample | 1e-03 | 1.000000 | 1.000000 | 1.0000000 |
LASSO | In-sample | 1e+00 | 1.000000 | 0.000000 | 0.7766911 |
LASSO | In-sample | 1e+02 | 1.000000 | 0.000000 | 0.7766911 |
LASSO | Out-of-sample | 1e-03 | 0.827044 | 0.255102 | 0.6923077 |
LASSO | Out-of-sample | 1e+00 | 1.000000 | 0.000000 | 0.7644231 |
LASSO | Out-of-sample | 1e+02 | 1.000000 | 0.000000 | 0.7644231 |
Algorithm 2: Linear SVM, \(c \in \{0,001, 1, 100\}\)
#Define function to run linear SVM, return predicted and actual values
run.svm = function(fold.test, c, sample){
train = which(fold!=fold.test)
test = which(fold==fold.test)
fit = svm(x=x[train,], y=factor(y[train]),
kernel="linear", cost=c, probability=TRUE)
if (sample == "in"){
yactual = y[train]
preds = predict(fit, x[train,]) %>% as.character %>% as.numeric
} else if (sample == "out"){
yactual = y[test]
preds = predict(fit, x[test,]) %>% as.character %>% as.numeric
}
return(cbind(preds,yactual))
}
#Run SVM
eval.svm.k4 = NA
for (c in c(0.001, 1, 100)){
result = run.svm(fold.test = 4, c=c, sample="in")
eval = evaluation(result)
eval.svm.k4 = rbind(eval.svm.k4, eval)
}
for (c in c(0.001, 1, 100)){
result = run.svm(fold.test = 4, c=c, sample="out")
eval = evaluation(result)
eval.svm.k4 = rbind(eval.svm.k4, eval)
}
eval.svm.k4 = eval.svm.k4[-1,]
eval.svm.k4 = eval.svm.k4 %>% as.data.frame
eval.svm.k4$Algorithm = "Linear SVM"
eval.svm.k4$Sample = c(rep("In-sample", 3), rep("Out-of-sample", 3))
eval.svm.k4$Cost = rep(c(0.001, 1, 100), 2)
eval.svm.k4 = eval.svm.k4[,c(4:6, 1:3)]
rownames(eval.svm.k4) = 1:6
The average performance using Linear SVM as classifier and fold 4 as test set is:
Algorithm | Sample | Cost | True Positive Rate | True Negative Rate | Accuracy |
---|---|---|---|---|---|
Linear SVM | In-sample | 1e-03 | 0.9968520 | 0.9343066 | 0.9828851 |
Linear SVM | In-sample | 1e+00 | 1.0000000 | 1.0000000 | 1.0000000 |
Linear SVM | In-sample | 1e+02 | 1.0000000 | 1.0000000 | 1.0000000 |
Linear SVM | Out-of-sample | 1e-03 | 0.8333333 | 0.2755102 | 0.7019231 |
Linear SVM | Out-of-sample | 1e+00 | 0.8238994 | 0.3367347 | 0.7091346 |
Linear SVM | Out-of-sample | 1e+02 | 0.8238994 | 0.3367347 | 0.7091346 |
Preparation: in-sample and out-of-sample weights
weight.out = table(fold) %>% as.numeric / length(fold)
weight.in = 1-weight.out
(a) LASSO
eval.lasso = NA
for (sample in c("in", "out")){
for (l in c(0.001, 1, 100)){
for (k in 1:4){
result = run.lasso(fold.test=k, lam=l, sample=sample)
eval = evaluation(result)
info = c("LASSO", sample, l, k)
eval.temp = c(info, eval)
eval.lasso = rbind(eval.lasso, eval.temp)
#print(eval)
#print(paste(k, l, sample, sep=" "))
}
}
}
eval.lasso.back = eval.lasso #back up
eval.lasso = eval.lasso[-1,]
eval.lasso = eval.lasso %>% as.data.frame
colnames(eval.lasso)[1:4] = c("Algorithm", "Sample", "Lambda", "Test fold")
rownames(eval.lasso) = 1:24
eval.lasso$Sample = ifelse(eval.lasso$Sample=="in", "In-sample", "Out-of-sample")
eval.lasso[5:7] = apply(eval.lasso[5:7], 2, function(x) x%>%as.character%>%as.numeric)
eval.lasso$Weight = c(rep(weight.in, 3), rep(weight.out, 3))
eval.lasso.weighted = NA
for (i in 0:2){
eval.lasso.weighted = rbind(eval.lasso.weighted, apply(eval.lasso[(1+i*4):(4+i*4), 5:7], 2, function(x) weighted.mean(x, weight.in, na.rm=T)))
}
for (i in 3:5){
eval.lasso.weighted = rbind(eval.lasso.weighted, apply(eval.lasso[(1+i*4):(4+i*4), 5:7], 2, function(x) weighted.mean(x, weight.out, na.rm=T)))
}
eval.lasso.weighted = eval.lasso.weighted[-1,]
eval.lasso.weighted = eval.lasso.weighted %>% as.data.frame
eval.lasso.weighted$Algorithm = "LASSO"
eval.lasso.weighted$Sample = c(rep("In-sample", 3), rep("Out-of-sample", 3))
eval.lasso.weighted$Lambda = rep(c(0.001, 1, 100), 2)
eval.lasso.weighted = eval.lasso.weighted[,c(4:6, 1:3)]
rownames(eval.lasso.weighted) = 1:6
The average weighted performance by using LASSO is:
Algorithm | Sample | Lambda | True Positive Rate | True Negative Rate | Accuracy |
---|---|---|---|---|---|
LASSO | In-sample | 1e-03 | 1.0000000 | 1.0000000 | 1.0000000 |
LASSO | In-sample | 1e+00 | 1.0000000 | 0.0000000 | 0.7735849 |
LASSO | In-sample | 1e+02 | 1.0000000 | 0.0000000 | 0.7735849 |
LASSO | Out-of-sample | 1e-03 | 0.8849079 | 0.2055443 | 0.7212416 |
LASSO | Out-of-sample | 1e+00 | 1.0000000 | 0.0000000 | 0.7735849 |
LASSO | Out-of-sample | 1e+02 | 1.0000000 | 0.0000000 | 0.7735849 |
(b) Linear SVM
eval.svm = NA
for(sample in c("in", "out")){
for(c in c(0.001, 1, 100)){
for (k in 1:4){
result = run.svm(fold.test = k, c=c, sample = sample)
eval = evaluation(result)
info = c("Linear SVM", sample, l, k)
eval.temp = c(info, eval)
eval.svm = rbind(eval.svm, eval.temp)
#print(c(sample, c, k))
}
}
}
eval.svm.back = eval.svm #backup
eval.svm = eval.svm.back[-1,]
eval.svm = eval.svm %>% as.data.frame
colnames(eval.svm)[1:4] = c("Algorithm", "Sample", "Cost", "Test fold")
rownames(eval.svm) = 1:24
eval.svm$Sample = ifelse(eval.svm$Sample=="in", "In-sample", "Out-of-sample")
eval.svm[5:7] = apply(eval.svm[5:7], 2, function(x) x%>%as.character%>%as.numeric)
eval.svm$Weight = c(rep(weight.in, 3), rep(weight.out, 3))
eval.svm.weighted = NA
for (i in 0:2){
eval.svm.weighted = rbind(eval.svm.weighted, apply(eval.svm[(1+i*4):(4+i*4), 5:7], 2, function(x) weighted.mean(x, weight.in, na.rm=T)))
}
for (i in 3:5){
eval.svm.weighted = rbind(eval.svm.weighted, apply(eval.svm[(1+i*4):(4+i*4), 5:7], 2, function(x) weighted.mean(x, weight.out, na.rm=T)))
}
eval.svm.weighted = eval.svm.weighted[-1,]
eval.svm.weighted = eval.svm.weighted %>% as.data.frame
eval.svm.weighted$Algorithm = "Linear SVM"
eval.svm.weighted$Sample = c(rep("In-sample", 3), rep("Out-of-sample", 3))
eval.svm.weighted$Lambda = rep(c(0.001, 1, 100), 2)
eval.svm.weighted = eval.svm.weighted[,c(4:6, 1:3)]
rownames(eval.svm.weighted) = 1:6
The average weighted performance by using Linear SVM is:
Algorithm | Sample | Lambda | True Positive Rate | True Negative Rate | Accuracy |
---|---|---|---|---|---|
Linear SVM | In-sample | 1e-03 | 0.9961360 | 0.8852721 | 0.9734226 |
Linear SVM | In-sample | 1e+00 | 1.0000000 | 1.0000000 | 1.0000000 |
Linear SVM | In-sample | 1e+02 | 1.0000000 | 1.0000000 | 1.0000000 |
Linear SVM | Out-of-sample | 1e-03 | 0.8985372 | 0.2757651 | 0.7474133 |
Linear SVM | Out-of-sample | 1e+00 | 0.8858228 | 0.3805490 | 0.7656726 |
Linear SVM | Out-of-sample | 1e+02 | 0.8858228 | 0.3805490 | 0.7656726 |
Use LASSO with \(\lambda=0.001\)
Justify your decision: It has a good balance across: 1) In-sample and
Out-of-sample performance; 2) TPR and TNR.
#Run final classifier with all labeled data
lasso.final = glmnet(x, y,
family="binomial", alpha=1, lambda=0.001)
Report the sampled video IDs and your hand-coded labels in a table.
Randomly select 10 videos
#Not using seed 08544 this time
set.seed(1123)
nolabel = which(is.na(video$label))
video.nolabel = video[nolabel,]
dfm.nolabel = dfm_tran_prop[nolabel,]
video.select = sample(1:length(nolabel), 10)
The 10 unlabeled videos randomly selected are:
x |
---|
FBI assembling GUN CONFISCATION master list |
How to Bid and Buy on GunBroker.com - Demo |
EXCLUSIVE: Bruce Paddock gives emotional on-camera interview about brother, Las Vegas shooting |
Dashcam clears cop accused of pulling gun on woman |
Excerpt: Active shooter training video |
Gun Shop Talk 1: Things You See And Hear In A Gun Shop |
Remington 870 Upgrades | Shooting USA |
London Murder Rate Overtakes New York City Despite Very Tough Gun Laws. Time To Ban KNIVES? |
Europeans Ask: Why Doesn’t America Act on Gun Violence? |
5 Things I learned at the Gun range |
The sampled video IDs and hand-coded labels are:
Video ID | hand coding |
---|---|
l17uEVfDKX8 | undetermined |
1uKnLau5Wl0 | pro-gun |
X7ttWB9fNSM | pro-gun |
qSQFkxNhrf8 | undetermined |
msHZ_-k6IQA | pro-gun |
2ZURASN5wy4 | pro-gun |
PBu-oSfQrRM | pro-gun |
O-8-hbzQfDg | undetermined |
Gg4DFq5YtY4 | pro-gun |
T7ZBLkgeh7I | undetermined |
Run your classifier on these videos and report the results as
well.
The results by the classifier are:
Video ID | Machine coding |
---|---|
l17uEVfDKX8 | pro-gun |
1uKnLau5Wl0 | pro-gun |
X7ttWB9fNSM | pro-gun |
qSQFkxNhrf8 | pro-gun |
msHZ_-k6IQA | pro-gun |
2ZURASN5wy4 | pro-gun |
PBu-oSfQrRM | pro-gun |
O-8-hbzQfDg | pro-gun |
Gg4DFq5YtY4 | pro-gun |
T7ZBLkgeh7I | pro-gun |
Briefly describe the basis on which your classifier made its decision.
Take a look at the words with largest and smallest coefficients:
Words with largest coefficients are:
word | coef | |
---|---|---|
4992 | naiv | -3.696025 |
1670 | texan | -3.446791 |
3860 | clap | -3.419424 |
5149 | guilt | -3.300987 |
1013 | douglass | -3.243504 |
3041 | conscienc | -3.208078 |
754 | huddl | -2.936500 |
4508 | siren | -2.747020 |
5729 | cynic | -2.535087 |
3756 | marathon | -2.157830 |
Words with smallest coefficients are:
word | coef | |
---|---|---|
4917 | denomin | 1.039424 |
3920 | roger | 1.066361 |
5603 | supervis | 1.089841 |
3953 | fuckin | 1.142647 |
4172 | reveng | 1.181267 |
5287 | vers | 1.211264 |
4711 | parad | 1.320034 |
3897 | analyst | 1.478958 |
3984 | motherfuck | 1.726091 |
5165 | judi | 3.149685 |
It seems that words with negative coefficients are more about crimes and laws, while those with positive coefficients about bad stuff…
#Problem 3
We now investigate the extent to which unsupervised dimension reduction procedures produce results that map onto your previous supervised classifier.
Create the data
set.seed(08544)
X1 = rnorm(1000, mean=0, sd=1)
X2 = rnorm(1000, mean=0, sd=0.75)
X3 = rnorm(1000, mean=0, sd=0.5)
X = cbind(X1, X2, X3)
Unstandardized case
X.svd = svd(X)
X.svd$v[,1]
## [1] 0.9933510 -0.1144430 0.0125147
plot3d(X, aspect = FALSE)
arrow3d(p0=c(0,0,0), p1=X.svd$v[,1], col="red")
Standardize and repeat
X.scaled = scale(X)
X.scaled.svd = svd(X.scaled)
X.scaled.svd$v[,1]
## [1] 0.4585749 -0.6851997 0.5658714
plot3d(X.scaled, aspect = FALSE)
arrow3d(p0=c(0,0,0), p1=X.scaled.svd$v[,1], col="red")
Compare the first right singular vectors
The fist right singular vector of unstandardized data is:
## [1] 0.9933510 -0.1144430 0.0125147
The first right singular vector of standardized data is:
## [1] 0.4585749 -0.6851997 0.5658714
Compare the variance
The variance of different dimensions of the unstandardized data is:
x | |
---|---|
X1 | 1.0075330 |
X2 | 0.5846430 |
X3 | 0.2356383 |
The variance of different dimensions of the standardized data is:
x | |
---|---|
X1 | 1 |
X2 | 1 |
X3 | 1 |
Put together:
RSV (unstandardized) | VAR (unstandardized) | RSV (standardized) | VAR(standardized) | |
---|---|---|---|---|
X1 | 0.9933510 | 1.0075330 | 0.4585749 | 1 |
X2 | -0.1144430 | 0.5846430 | -0.6851997 | 1 |
X3 | 0.0125147 | 0.2356383 | 0.5658714 | 1 |
Center the DTM and also standardize it.
dfm_tran_scaled = scale(dfm_tran_prop, center = TRUE, scale = TRUE)
dfm_tran_alt_scaled = scale(dfm_tran_alt_prop, center=TRUE, scale=TRUE)
Conduct dimension reduction with SVD
#now = Sys.time()
svd.dfm = svd(dfm_tran_scaled)
svd.dfm.alt = svd(dfm_tran_alt_scaled)
#difftime(Sys.time(), now)
Produce a “scree plot” over the top 10 dimensions: Main
preprocessing regime
“scree plot”: Alternative regime
Examine most positive and negative loadings of the first dimension
svd.dfm.d1.load = cbind(svd.dfm$v[,1], dfm_tran_prop %>% colnames) %>% as.data.frame
colnames(svd.dfm.d1.load) = c("load", "word")
svd.dfm.d1.load$load = svd.dfm.d1.load$load %>% as.character %>% as.numeric
svd.dfm.d1.load = svd.dfm.d1.load[order(svd.dfm.d1.load[,1]),]
Most negative loadings of the first dimension are:
load | word | |
---|---|---|
1093 | -0.0841441 | littl |
1366 | -0.0775262 | bit |
1326 | -0.0765709 | barrel |
500 | -0.0708779 | round |
76 | -0.0679575 | pretti |
2082 | -0.0640453 | bolt |
328 | -0.0636798 | rifl |
1347 | -0.0620350 | recoil |
197 | -0.0607834 | realli |
339 | -0.0604994 | trigger |
Most positive loadings of the first dimension are:
load | word | |
---|---|---|
208 | 0.0405757 | happen |
539 | 0.0421604 | amend |
241 | 0.0435178 | kill |
93 | 0.0435928 | state |
6 | 0.0468550 | violenc |
661 | 0.0479577 | school |
190 | 0.0511430 | countri |
39 | 0.0512496 | law |
97 | 0.0529350 | say |
240 | 0.0648527 | peopl |
Qualitatively, what does this appear to
capture?
It seems that most positive loadings are about violence and public
places, where negative ones are unclear.
Sample videos that score in the top and bottom deciles on the first dimension
load.d1 = svd.dfm$u[,1]
load.d1 = data.frame(video = 1:3800,
load = load.d1)
load.d1 = load.d1[order(load.d1$load),]
set.seed(08544)
load.d1.sample.top = sample(load.d1$video[1:380], 10)
load.d1.sample.bot = sample(load.d1$video[3421:3800], 10)
top = video.data[load.d1.sample.top,]
bot = video.data[load.d1.sample.bot,]
The average number of words of 10 videos sampled from the top docile is:
## [1] 1604.4
The average number of words of 10 videos sampled from the bottom docile is:
ntoken(toke_tran[load.d1.sample.bot]) %>% mean
## [1] 672.8
The average number of words of all videos is:
## [1] 1717.635
Describe your results. Do they confirm your initial assessment? How do videos at the extremes compare to each other (e.g. in word count, duration, views, etc.)? To the average video?
It seems that the videos of the top decile have fewer words, thus smaller duration.
Repeat this procedure for the second dimension
svd.dfm.d2.load = cbind(svd.dfm$v[,2], dfm_tran_prop %>% colnames) %>% as.data.frame
colnames(svd.dfm.d2.load) = c("load", "word")
svd.dfm.d2.load$load = svd.dfm.d2.load$load %>% as.character %>% as.numeric
svd.dfm.d2.load = svd.dfm.d2.load[order(svd.dfm.d2.load[,1]),]
Most positive loadings of the second dimension are:
load | word | |
---|---|---|
36 | -0.0788966 | just |
152 | -0.0724119 | got |
72 | -0.0663637 | yeah |
186 | -0.0610026 | oh |
15 | -0.0582699 | know |
40 | -0.0580642 | go |
71 | -0.0572401 | okay |
57 | -0.0561068 | like |
81 | -0.0545385 | gonna |
78 | -0.0535242 | get |
Most negative loadings of the second dimension are:
load | word | |
---|---|---|
1103 | 0.0643904 | feder |
308 | 0.0646637 | includ |
2788 | 0.0673907 | prohibit |
2379 | 0.0694540 | misinform |
1608 | 0.0734892 | requir |
39 | 0.0746128 | law |
489 | 0.0800166 | various |
3374 | 0.0842344 | kraut |
93 | 0.0848819 | state |
320 | 0.0853443 | firearm |
Qualitatively, what does this appear to
capture?
The negative ones are not very clear… The positive ones look like about
regulations.
Sample videos that score in the top and bottom deciles on the second dimension
load.d2 = svd.dfm$u[,2]
load.d2 = data.frame(video = 1:3800,
load = load.d2)
load.d2 = load.d2[order(load.d2$load),]
set.seed(08544)
load.d2.sample.top = sample(load.d2$video[1:380], 10)
load.d2.sample.bot = sample(load.d2$video[3421:3800], 10)
top = video.data[load.d2.sample.top,]
bot = video.data[load.d2.sample.bot,]
The average number of words of 10 videos sampled from the top docile is:
## [1] 920.6
The average number of words of 10 videos sampled from the bottom docile is:
ntoken(toke_tran[load.d2.sample.bot]) %>% mean
## [1] 1335.4
The average number of words of all videos is:
## [1] 1717.635
Describe your results. Do they confirm your initial
assessment? How do videos at the extremes compare to each other (e.g. in
word count, duration, views, etc.)? To the average video?
Still, it seems that the videos of the top decile have fewer words, thus
smaller duration.
If you had to label your axes in a single word or phrase
(alternatively, with labels for the positive/negative directions), what
would they be?
Dimension 1 will be labelled as violence, and dimension as
regulation/law.
Create a bivariate plot: Fitted values vs. First dimension
Create a second plot: First dimension vs. Second dimension, with fitted values
Plot first dimension scores from both approaches agaisnt each other
Discuss the sensitivity
It seems that the first-dimension scores are almost identical for the
two different preprocessing schemes, which implies that the
preprocessing decisions (specifically, the threshold of removing
infrequent words in this case), is not very sensitive.