Text as Data Project: Headline Topic Modeling
#no automatic data transformation
options(stringsAsFactors = F)
#suppress scientific notation
options("scipen" = 100, "digits" = 4)
#load packages
library(knitr)
library(kableExtra)
library(DT)
library(tm)
library(topicmodels)
library(reshape2)
library(ggplot2)
library(wordcloud)
library(pals)
library(SnowballC)
library(lda)
library(ldatuning)
library(flextable)
library(tidyverse)
I am using code from a wonderfully helpful tutorial to do some exploratory topic modeling with the headline data.
Schweinberger, Martin. 2022. Topic Modeling with R. Brisbane: The University of Queensland. url: https://slcladal.github.io/topicmodels.html (Version 2022.03.18).
Loading the entirety of the headlines pulled from the New York Times API, I will pre-process them and create a corpus object.
I am transforming the text to lower case, removing English stop words, removing punctuation and numbers, and stripping white space. I am going to use stemming to begin the analysis, though I may go back and change this.
(Added after running the analyses in this post:) I had to create functions to remove ‘curly’ apostrophes after they showed up among the top results in two models.
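The definitions themselves aren't in my notes, so here is a minimal sketch of the two transformers used in the pre-processing chain below, assuming they simply strip the right and left curly single-quote characters:
#sketch: strip curly apostrophes/quotes (assumed patterns)
exchanger1 <- content_transformer(function(x) gsub("\u2019", "", x))
exchanger2 <- content_transformer(function(x) gsub("\u2018", "", x))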
#load data
textdata <- read.csv("all_headlines.csv")
#load stop words
english_stopwords <- readLines("https://slcladal.github.io/resources/stopwords_en.txt", encoding = "UTF-8")
#create corpus object
corpus <- Corpus(DataframeSource(textdata))
#pre-processing chain
processedCorpus <- tm_map(corpus, content_transformer(tolower))
processedCorpus <- tm_map(processedCorpus, removeWords, english_stopwords)
processedCorpus <- tm_map(processedCorpus, removePunctuation, preserve_intra_word_dashes = TRUE)
processedCorpus <- tm_map(processedCorpus, removeNumbers)
processedCorpus <- tm_map(processedCorpus, stripWhitespace)
processedCorpus <- tm_map(processedCorpus, exchanger1)
processedCorpus <- tm_map(processedCorpus, exchanger2)
The “topicmodels” package requires a Document Term Matrix.
minimumFrequency <- 1
DTM <- DocumentTermMatrix(processedCorpus, control = list(bounds = list(global = c(minimumFrequency, Inf))))
#preview
dim(DTM)
[1] 1872 3422
I need to clean up the matrix to remove any rows left empty by stemming and stop-word removal.
sel_idx <- slam::row_sums(DTM) > 0
DTM <- DTM[sel_idx, ]
textdata <- textdata[sel_idx, ]
I'll choose five random document numbers to pull and use as examples when examining the models.
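The sampling call isn't preserved in my notes, but the five ids it produced are shown in the output below; fixing them explicitly keeps the examples reproducible (the lapply display is an assumption about how the output was printed):
#ids drawn at random in the original run
exampleIds <- c(180, 250, 330, 430, 510)
lapply(corpus[exampleIds], as.character)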
$`180`
[1] "Afghanistan to Release Last Taliban Prisoners, Removing Final Hurdle to Talks"
$`250`
[1] "Trump’s Campaign Talk of Troop Withdrawals Doesn’t Match Military Reality"
$`330`
[1] "U.S. Leaves Behind Afghan Bases — and a Legacy of Land Disputes"
$`430`
[1] "Blinken, Without Leaving Home, Tries to Mend Fences With Allies Abroad"
$`510`
[1] "War, Peace and Taliban Spreadsheets"
Now I can create models to examine the metrics that can guide choosing the optimal number of topics. Since I am unsure whether the VEM or Gibbs method will be more appropriate, I'm going to try both.
#create models with different number of topics
result <- ldatuning::FindTopicsNumber(
DTM,
topics = seq(from = 2, to = 20, by = 1),
metrics = c("CaoJuan2009", "Deveaud2014", "Arun2010", "Griffiths2004"),
method = "Gibbs",
control = list(seed = 11),
verbose = TRUE
)
fit models... done.
calculate metrics:
CaoJuan2009... done.
Deveaud2014... done.
Arun2010... done.
Griffiths2004... done.
After inspecting the results of all four metrics from the “ldatuning” package relevant to the Gibbs method, it seems that a good starting point will be 5 topics, or “K”.
FindTopicsNumber_plot(result)
I’ll set the topic model to run 1,000 iterations.
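The fitting call itself isn't shown in my notes; a sketch of it, with K = 5 from the tuning step (the verbose and seed settings are assumptions chosen to match the log below):
#fit the Gibbs model (sketch)
K <- 5
topicModel <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25, seed = 11))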
K = 5; V = 3422; M = 1872
Sampling 1000 iterations!
Iteration 25 ...
Iteration 50 ...
...
Iteration 975 ...
Iteration 1000 ...
Gibbs sampling completed!
#look at posterior distributions
tmResult <- posterior(topicModel)
#format of the resulting object
attributes(tmResult)
$names
[1] "terms" "topics"
#lengthOfVocab
nTerms(DTM)
[1] 3422
#topics are probability distributions over the entire vocabulary
#get beta from results
beta <- tmResult$terms
#K distributions over nTerms(DTM) terms
dim(beta)
[1] 5 3422
#rows in beta sum to 1
rowSums(beta)
1 2 3 4 5
1 1 1 1 1
#size of collection
nDocs(DTM)
[1] 1872
#for every document we have a probability distribution of its contained topics
theta <- tmResult$topics
#nDocs(DTM) distributions over K topics
dim(theta)
[1] 1872 5
#rows in theta sum to 1
rowSums(theta)[1:10]
1 2 3 4 5 6 7 8 9 10
1 1 1 1 1 1 1 1 1 1
Now I can look at the 10 most likely terms for each of the inferred topics. Examining all five topics gives a clearer idea of how they are represented in this model.
terms(topicModel, 10)
Topic 1 Topic 2 Topic 3 Topic 4
[1,] "trump" "biden" "taliban" "afghanistan"
[2,] "military" "china" "afghan" "war"
[3,] "house" "world" "afghans" "afghan"
[4,] "democrats" "migrants" "peace" "biden"
[5,] "bounties" "home" "women" "kabul"
[6,] "russia" "refugees" "afghanistan" "exit"
[7,] "russian" "allies" "talks" "withdrawal"
[8,] "top" "takes" "kabul" "troops"
[9,] "general" "pandemic" "forces" "end"
[10,] "intelligence" "faces" "deal" "veterans"
Topic 5
[1,] "pentagon"
[2,] "leader"
[3,] "pakistan"
[4,] "president"
[5,] "killed"
[6,] "troops"
[7,] "iran"
[8,] "attack"
[9,] "guant<e1>namo"
[10,] "defense"
To look at the models more easily, I’ll name the strings with the top 5 most likely terms for each topic.
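Following the tutorial, the names are simply each topic's top five terms pasted together:
top5termsPerTopic <- terms(topicModel, 5)
topicNames <- apply(top5termsPerTopic, 2, paste, collapse = " ")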
After looking into the documents, I can visualize the topic distributions within the documents.
N <- length(exampleIds)
# get topic proportions from example documents
topicProportionExamples <- theta[exampleIds,]
colnames(topicProportionExamples) <- topicNames
vizDataFrame <- melt(cbind(data.frame(topicProportionExamples), document = factor(1:N)), variable.name = "topic", id.vars = "document")
ggplot(data = vizDataFrame, aes(topic, value, fill = document)) +
geom_bar(stat="identity") +
ylab("proportion") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip() +
facet_wrap(~ document, ncol = N)
I will try to get a more meaningful ordering of the top terms per topic by re-ranking them with a specific score (Chang et al. 2009).
The idea behind re-ranking is similar to TF-IDF: the more often a term appears near the top of many topics' probability rankings, the less useful it is for describing any single topic. Hence, the score favors terms that are specific to a topic.
# re-rank top topic terms for topic names
topicNames <- apply(lda::top.topic.words(beta, 5, by.score = T), 2, paste, collapse = " ")
Sort topics according to their probability within the entire collection:
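A sketch of this sorting step, following the tutorial: average each topic's proportion over all documents, attach the topic names, and sort.
# mean topic probabilities over all documents
topicProportions <- colSums(theta) / nDocs(DTM)
names(topicProportions) <- topicNames
sort(topicProportions, decreasing = TRUE)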
taliban afghan afghans peace women
0.2084
afghanistan war afghan kabul exit
0.2043
pentagon leader pakistan killed iran
0.1981
trump military house democrats bounties
0.1961
biden china world refugees migrants
0.1932
We count how often a topic appears as the primary topic within a document. This method is also called Rank-1.
countsOfPrimaryTopics <- rep(0, K)
names(countsOfPrimaryTopics) <- topicNames
for (i in 1:nDocs(DTM)) {
topicsPerDoc <- theta[i, ] # select topic distribution for document i
# get first element position from ordered list
primaryTopic <- order(topicsPerDoc, decreasing = TRUE)[1]
countsOfPrimaryTopics[primaryTopic] <- countsOfPrimaryTopics[primaryTopic] + 1
}
sort(countsOfPrimaryTopics, decreasing = TRUE)
taliban afghan afghans peace women
431
trump military house democrats bounties
412
biden china world refugees migrants
380
afghanistan war afghan kabul exit
349
pentagon leader pakistan killed iran
300
Sorting topics by the Rank-1 method places topics with rather specific thematic coherence at the top of the list.
The word cloud is a good preliminary way to look at the topics. I'll choose topic 3 because, across the five sample documents, it seems to capture the prevailing headline topic most broadly over the modeled time period. It is also, intuitively, not an unreasonable choice.
# visualize topics as word cloud
topicToViz <- 3 # change for your own topic of interest
#topicToViz <- grep('fear', topicNames)[1] # or select a topic by a term contained in its name
# select the 40 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top40terms <- sort(tmResult$terms[topicToViz,], decreasing=TRUE)[1:40]
words <- names(top40terms)
# extract the probabilities of each of the 40 terms
probabilities <- sort(tmResult$terms[topicToViz,], decreasing=TRUE)[1:40]
# visualize the terms as a wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, colors = mycolors)
Changing the alpha to a lower level
For alpha values greater than one, samples from the Dirichlet prior congregate toward the center of the simplex. This means that as alpha gets bigger, documents are more likely to look uniform, that is, to represent an even mixture of all the topics. Since that was clearly the case in the first model, I will lower alpha from 1.0 to 0.1.
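To make that concrete, here is a quick base-R sketch (the helper rdir is hypothetical, using the standard gamma-normalization construction of the Dirichlet):
#draw n samples from a Dirichlet with parameter vector alpha
rdir <- function(n, alpha) {
  g <- matrix(rgamma(n * length(alpha), shape = alpha), ncol = length(alpha), byrow = TRUE)
  g / rowSums(g)  # normalize each row to sum to 1
}
set.seed(1)
round(rdir(3, rep(1.0, 5)), 2)  # alpha = 1: fairly even mixtures are common
round(rdir(3, rep(0.1, 5)), 2)  # alpha = 0.1: mass concentrates on a few topics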
#see alpha from previous model
attr(topicModel, "alpha")
[1] 1
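The re-fit isn't shown in my notes; a sketch, keeping the earlier settings and only lowering alpha (the other control values are assumptions carried over):
#re-fit the Gibbs model with a lower alpha (sketch)
topicModelb <- LDA(DTM, K, method = "Gibbs", control = list(iter = 1000, verbose = 25, seed = 11, alpha = 0.1))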
K = 5; V = 3422; M = 1872
Sampling 1000 iterations!
Iteration 25 ...
Iteration 50 ...
...
Iteration 975 ...
Iteration 1000 ...
Gibbs sampling completed!
#look at posterior distributions
tmResultb <- posterior(topicModelb)
#format of the resulting object
attributes(tmResultb)
$names
[1] "terms" "topics"
#topics are probability distributions over the entire vocabulary
#get beta from results
betab <- tmResultb$terms
#K distributions over nTerms(DTM) terms
dim(betab)
[1] 5 3422
#rows in beta sum to 1
rowSums(betab)
1 2 3 4 5
1 1 1 1 1
#for every document we have a probability distribution of its contained topics
thetab <- tmResultb$topics
#nDocs(DTM) distributions over K topics
dim(thetab)
[1] 1872 5
#rows in theta sum to 1
rowSums(thetab)[1:10]
1 2 3 4 5 6 7 8 9 10
1 1 1 1 1 1 1 1 1 1
Now I can look at the 10 most likely terms for each of the inferred topics. Examining all five topics gives a clearer idea of how they are represented in this model.
terms(topicModelb, 10)
Topic 1 Topic 2 Topic 3 Topic 4
[1,] "trump" "troops" "biden" "afghanistan"
[2,] "biden" "afghanistan" "war" "biden"
[3,] "military" "afghan" "afghanistan" "trump"
[4,] "house" "military" "afghan" "world"
[5,] "iran" "pentagon" "exit" "afghan"
[6,] "president" "trump" "veterans" "aid"
[7,] "democrats" "kabul" "migrants" "power"
[8,] "russia" "war" "end" "policy"
[9,] "russian" "years" "withdrawal" "india"
[10,] "bounties" "drone" "faces" "takes"
Topic 5
[1,] "taliban"
[2,] "afghan"
[3,] "afghanistan"
[4,] "kabul"
[5,] "peace"
[6,] "afghans"
[7,] "women"
[8,] "talks"
[9,] "deal"
[10,] "attack"
# get topic proportions from example documents
topicProportionExamplesb <- thetab[exampleIds,]
colnames(topicProportionExamplesb) <- topicNamesb
vizDataFrameb <- melt(cbind(data.frame(topicProportionExamplesb), document = factor(1:N)), variable.name = "topic", id.vars = "document")
ggplot(data = vizDataFrameb, aes(topic, value, fill = document)) +
geom_bar(stat="identity") +
ylab("proportion") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip() +
facet_wrap(~ document, ncol = N)
I will try to get a more meaningful ordering of the top terms per topic by re-ranking them with a specific score (Chang et al. 2009).
The idea behind re-ranking is similar to TF-IDF: the more often a term appears near the top of many topics' probability rankings, the less useful it is for describing any single topic. Hence, the score favors terms that are specific to a topic.
# re-rank top topic terms for topic names
topicNamesb <- apply(lda::top.topic.words(betab, 5, by.score = T), 2, paste, collapse = " ")
Sort topics according to their probability within the entire collection:
taliban afghan peace kabul afghanistan
0.2432
biden war veterans afghanistan exit
0.1992
military pentagon troops kabul drone
0.1990
trump house military biden iran
0.1807
world biden trump policy aid
0.1779
We count how often a topic appears as the primary topic within a document. This method is also called Rank-1.
countsOfPrimaryTopicsb <- rep(0, K)
names(countsOfPrimaryTopicsb) <- topicNamesb
for (i in 1:nDocs(DTM)) {
topicsPerDocb <- thetab[i, ] # select topic distribution for document i
# get first element position from ordered list
primaryTopicb <- order(topicsPerDocb, decreasing = TRUE)[1]
countsOfPrimaryTopicsb[primaryTopicb] <- countsOfPrimaryTopicsb[primaryTopicb] + 1
}
sort(countsOfPrimaryTopicsb, decreasing = TRUE)
taliban afghan peace kabul afghanistan
455
military pentagon troops kabul drone
388
biden war veterans afghanistan exit
367
trump house military biden iran
346
world biden trump policy aid
316
Sorting topics by the Rank-1 method places topics with rather specific thematic coherence at the top of the list.
The word cloud is a good preliminary way to look at the topics. With the lower alpha, the topic proportions are more granular and it is harder to read an overall prevailing topic off the visualization, so I'll use the ranking instead and look at topic 4.
# visualize topics as word cloud
topicToVizb <- 4 # change for your own topic of interest
#topicToVizb <- grep('fear', topicNamesb)[1] # or select a topic by a term contained in its name
# select the 40 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top40termsb <- sort(tmResultb$terms[topicToVizb,], decreasing=TRUE)[1:40]
wordsb <- names(top40termsb)
# extract the probabilities of each of the 40 terms
probabilitiesb <- sort(tmResultb$terms[topicToVizb,], decreasing=TRUE)[1:40]
# visualize the terms as a wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(wordsb, probabilitiesb, random.order = FALSE, colors = mycolors)
Because a topic model gives topic probabilities for each document (here, each headline), it can be used for thematic filtering of a collection. As a filter, we select only those documents that exceed a certain threshold of probability for a given topic (for example, each document that contains topic 4 at more than 20 percent).
topicToFilterb <- 4 # you can set this manually ...
# ... or have it selected by a term in the topic name (e.g. 'children')
#topicToFilterb <- grep('withdrawal', topicNamesb)[1]
topicThresholdb <- 0.2
selectedDocumentIndexesb <- which(thetab[, topicToFilterb] >= topicThresholdb)
filteredCorpusb <- corpus[selectedDocumentIndexesb]
# show length of filtered corpus
filteredCorpusb
<<SimpleCorpus>>
Metadata: corpus specific: 1, document level (indexed): 2
Content: documents: 468
In a last step, I'll take a distant view of the topics in the data over time. For this, I aggregate the mean topic proportions per month for all of the topics. These aggregated topic proportions can then be visualized, e.g. as a bar plot.
#convert non-graph characters to combat error in grid.Call
topicNamesb=str_replace_all(topicNamesb,"[^[:graph:]]", " ")
# append month information for aggregation
textdata$month <- paste0(substr(textdata$month.ended, 0, 3), "0")
# get mean topic proportions per month
topic_proportion_per_month <- aggregate(thetab, by = list(month = textdata$month), mean)
# set topic names to aggregated columns
colnames(topic_proportion_per_month)[2:(K+1)] <- topicNamesb
# reshape data frame
vizDataFrame <- melt(topic_proportion_per_month, id.vars = "month")
# plot topic proportions per month as bar plot
ggplot(vizDataFrame, aes(x=month, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values = paste0(alphabet(20), "FF"), name = "topic") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
The VEM method gives 3 metrics; the “Griffiths2004” metric is not compatible with this method.
#create models with different number of topics
result2 <- ldatuning::FindTopicsNumber(
DTM,
topics = seq(from = 2, to = 20, by = 1),
metrics = c("CaoJuan2009", "Deveaud2014", "Arun2010", "Griffiths2004"),
method = "VEM",
control = list(seed = 11),
verbose = TRUE
)
'Griffiths2004' is incompatible with 'VEM' method, excluded.
fit models... done.
calculate metrics:
CaoJuan2009... done.
Deveaud2014... done.
Arun2010... done.
After inspecting the results of all three metrics from the “ldatuning” package relevant to the VEM method, it seems that a good starting point will be 3 topics, or “K”.
FindTopicsNumber_plot(result2)
With the VEM method, the alpha is estimated automatically.
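The fit itself isn't shown in my notes; a sketch with K2 = 3 from the tuning results (the seed is an assumption carried over from the tuning step):
#fit the VEM model (sketch)
K2 <- 3
topicModel2 <- LDA(DTM, K2, method = "VEM", control = list(seed = 11))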
#look at posterior distributions
tmResult2 <- posterior(topicModel2)
#format of the resulting object
attributes(tmResult2)
$names
[1] "terms" "topics"
#topics are probability distributions over the entire vocabulary
#get beta from results
beta2 <- tmResult2$terms
#K distributions over nTerms(DTM) terms
dim(beta2)
[1] 3 3422
#rows in beta sum to 1
rowSums(beta2)
1 2 3
1 1 1
#for every document we have a probability distribution of its contained topics
theta2 <- tmResult2$topics
#nDocs(DTM) distributions over K topics
dim(theta2)
[1] 1872 3
#rows in theta sum to 1
rowSums(theta2)[1:10]
1 2 3 4 5 6 7 8 9 10
1 1 1 1 1 1 1 1 1 1
Now I can look at the 10 most likely terms for each of the inferred topics. Examining all three topics gives a clearer idea of how they are represented in this model.
terms(topicModel2, 10)
Topic 1 Topic 2 Topic 3
[1,] "afghan" "taliban" "afghanistan"
[2,] "taliban" "afghans" "trump"
[3,] "afghanistan" "biden" "war"
[4,] "biden" "afghan" "biden"
[5,] "peace" "china" "troops"
[6,] "leader" "trump" "afghan"
[7,] "talks" "kabul" "military"
[8,] "women" "democrats" "kabul"
[9,] "forces" "war" "exit"
[10,] "attack" "veterans" "end"
To look at the models more easily, I’ll name the strings with the top 5 most likely terms for each topic.
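As before, the names are each topic's top five terms pasted together:
top5termsPerTopic2 <- terms(topicModel2, 5)
topicNames2 <- apply(top5termsPerTopic2, 2, paste, collapse = " ")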
After looking into the documents, I can visualize the topic distributions within the documents.
N <- length(exampleIds)
# get topic proportions from example documents
topicProportionExamples2 <- theta2[exampleIds,]
colnames(topicProportionExamples2) <- topicNames2
vizDataFrame2 <- melt(cbind(data.frame(topicProportionExamples2), document = factor(1:N)), variable.name = "topic", id.vars = "document")
ggplot(data = vizDataFrame2, aes(topic, value, fill = document)) +
geom_bar(stat="identity") +
ylab("proportion") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip() +
facet_wrap(~ document, ncol = N)
I will try to get a more meaningful ordering of the top terms per topic by re-ranking them with a specific score (Chang et al. 2009).
The idea behind re-ranking is similar to TF-IDF: the more often a term appears near the top of many topics' probability rankings, the less useful it is for describing any single topic. Hence, the score favors terms that are specific to a topic.
# re-rank top topic terms for topic names
topicNames2 <- apply(lda::top.topic.words(beta2, 5, by.score = T), 2, paste, collapse = " ")
Sort topics according to their probability within the entire collection:
taliban peace leader talks afghanistan
0.3411
afghanistan troops trump war exit
0.3409
afghans taliban china migrants democrats
0.3180
We count how often a topic appears as the primary topic within a document. This method is also called Rank-1.
countsOfPrimaryTopics2 <- rep(0, K2)
names(countsOfPrimaryTopics2) <- topicNames2
for (i in 1:nDocs(DTM)) {
topicsPerDoc2 <- theta2[i, ] # select topic distribution for document i
# get first element position from ordered list
primaryTopic2 <- order(topicsPerDoc2, decreasing = TRUE)[1]
countsOfPrimaryTopics2[primaryTopic2] <- countsOfPrimaryTopics2[primaryTopic2] + 1
}
sort(countsOfPrimaryTopics2, decreasing = TRUE)
taliban peace leader talks afghanistan
646
afghanistan troops trump war exit
636
afghans taliban china migrants democrats
590
Sorting topics by the Rank-1 method places topics with rather specific thematic coherence at the top of the list.
The word cloud is a good preliminary way to look at the topics. I'll look at topic 2, because the ranking validates my intuitive guess; in this model's visualization the topics are all distributed so broadly that the plot itself doesn't help much.
#visualize topics as word cloud
topicToViz2 <- 2 # change for your own topic of interest
#topicToViz2 <- grep('fear', topicNames2)[1] # or select a topic by a term contained in its name
#select the 40 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top40terms2 <- sort(tmResult2$terms[topicToViz2,], decreasing=TRUE)[1:40]
words2 <- names(top40terms2)
#convert non-graph characters to combat error in wordcloud
words2 <- str_replace_all(words2, "[^[:graph:]]", " ")
#extract the probabilities of each of the 40 terms
probabilities2 <- sort(tmResult2$terms[topicToViz2,], decreasing=TRUE)[1:40]
#visualize the terms as wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words2, probabilities2, random.order = FALSE, colors = mycolors)
Changing the alpha from the automatically estimated value (about 0.12) to the lower value of 0.03.
# see alpha from previous model
attr(topicModel2, "alpha")
[1] 0.119
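The re-fit isn't in my notes; a sketch that fixes alpha manually (estimate.alpha = FALSE keeps VEM from re-estimating it; the seed is an assumption):
#re-fit the VEM model with a fixed, lower alpha (sketch)
topicModel2b <- LDA(DTM, K2, method = "VEM", control = list(seed = 11, alpha = 0.03, estimate.alpha = FALSE))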
#look at posterior distributions
tmResult2b <- posterior(topicModel2b)
#format of the resulting object
attributes(tmResult2b)
$names
[1] "terms" "topics"
#topics are probability distributions over the entire vocabulary
#get beta from results
beta2b <- tmResult2b$terms
#K distributions over nTerms(DTM) terms
dim(beta2b)
[1] 3 3422
#rows in beta sum to 1
rowSums(beta2b)
1 2 3
1 1 1
#for every document we have a probability distribution of its contained topics
theta2b <- tmResult2b$topics
#nDocs(DTM) distributions over K topics
dim(theta2b)
[1] 1872 3
#rows in theta sum to 1
rowSums(theta2b)[1:10]
1 2 3 4 5 6 7 8 9 10
1 1 1 1 1 1 1 1 1 1
Now I can look at the 10 most likely terms for each of the inferred topics. Examining all three topics gives a clearer idea of how they are represented in this model.
terms(topicModel2b, 10)
Topic 1 Topic 2 Topic 3
[1,] "trump" "afghanistan" "afghan"
[2,] "afghanistan" "biden" "taliban"
[3,] "afghan" "taliban" "war"
[4,] "biden" "afghan" "biden"
[5,] "taliban" "house" "afghanistan"
[6,] "troops" "military" "kabul"
[7,] "military" "pakistan" "peace"
[8,] "pentagon" "kabul" "afghans"
[9,] "bounties" "defense" "pentagon"
[10,] "afghans" "war" "talks"
# get topic proportions from example documents
topicProportionExamples2b <- theta2b[exampleIds,]
colnames(topicProportionExamples2b) <- topicNames2b
vizDataFrame2b <- melt(cbind(data.frame(topicProportionExamples2b), document = factor(1:N)), variable.name = "topic", id.vars = "document")
ggplot(data = vizDataFrame2b, aes(topic, value, fill = document)) +
geom_bar(stat="identity") +
ylab("proportion") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip() +
facet_wrap(~ document, ncol = N)
I will try to get a more meaningful ordering of the top terms per topic by re-ranking them with a specific score (Chang et al. 2009).
The idea behind re-ranking is similar to TF-IDF: the more often a term appears near the top of many topics' probability rankings, the less useful it is for describing any single topic. Hence, the score favors terms that are specific to a topic.
# re-rank top topic terms for topic names
topicNames2b <- apply(lda::top.topic.words(beta2b, 5, by.score = T), 2, paste, collapse = " ")
Sort topics according to their probability within the entire collection:
peace pentagon war talks taliban
0.3615
trump bounties congress pentagon general
0.3250
defense pullout house pakistan secretary
0.3135
We count how often a topic appears as the primary topic within a document. This method is also called Rank-1.
countsOfPrimaryTopics2b <- rep(0, K2)
names(countsOfPrimaryTopics2b) <- topicNames2b
for (i in 1:nDocs(DTM)) {
topicsPerDoc2b <- theta2b[i, ] # select topic distribution for document i
# get first element position from ordered list
primaryTopic2b <- order(topicsPerDoc2b, decreasing = TRUE)[1]
countsOfPrimaryTopics2b[primaryTopic2b] <- countsOfPrimaryTopics2b[primaryTopic2b] + 1
}
sort(countsOfPrimaryTopics2b, decreasing = TRUE)
peace pentagon war talks taliban
675
trump bounties congress pentagon general
610
defense pullout house pakistan secretary
587
Sorting topics by the Rank-1 method places topics with rather specific thematic coherence at the top of the list.
The word cloud is a good preliminary way to look at the topics. I'll look at topic 3, again based on its predominance in the ranking.
#visualize topics as word cloud
topicToViz2b <- 3 # change for your own topic of interest
#topicToViz2b <- grep('fear', topicNames2b)[1] # or select a topic by a term contained in its name
#select the 40 most probable terms from the topic by sorting the term-topic probability vector in decreasing order
top40terms2b <- sort(tmResult2b$terms[topicToViz2b,], decreasing=TRUE)[1:40]
words2b <- names(top40terms2b)
#convert non-graph characters to combat error in wordcloud
words2b <- str_replace_all(words2b, "[^[:graph:]]", " ")
#extract the probabilities of each of the 40 terms
probabilities2b <- sort(tmResult2b$terms[topicToViz2b,], decreasing=TRUE)[1:40]
#visualize the terms as wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words2b, probabilities2b, random.order = FALSE, colors = mycolors)
Because a topic model gives topic probabilities for each document, it can again be used for thematic filtering of the collection. As a filter, we select only those documents that exceed a certain threshold of probability for a given topic (for example, each document that contains topic 3 at more than 20 percent).
topicToFilter2b <- 3 # you can set this manually ...
# ... or have it selected by a term in the topic name (e.g. 'children')
#topicToFilter2b <- grep('withdrawal', topicNames2)[1]
topicThreshold2b <- 0.2
selectedDocumentIndexes2b <- which(theta2b[, topicToFilter2b] >= topicThreshold2b)
filteredCorpus2b <- corpus[selectedDocumentIndexes2b]
# show length of filtered corpus
filteredCorpus2b
<<SimpleCorpus>>
Metadata: corpus specific: 1, document level (indexed): 2
Content: documents: 718
In a last step, I'll take a distant view of the topics in the data over time. For this, I aggregate the mean topic proportions per month for all of the topics. These aggregated topic proportions can then be visualized, e.g. as a bar plot.
#convert non-graph characters to combat error in grid.Call
topicNames2b=str_replace_all(topicNames2b,"[^[:graph:]]", " ")
# append month information for aggregation
textdata$month <- paste0(substr(textdata$month.ended, 0, 3), "0")
# get mean topic proportions per month
topic_proportion_per_month2b <- aggregate(theta2b, by = list(month = textdata$month), mean)
# set topic names to aggregated columns
colnames(topic_proportion_per_month2b)[2:(K2+1)] <- topicNames2b
# reshape data frame
vizDataFrame2b <- melt(topic_proportion_per_month2b, id.vars = "month")
# plot topic proportions per month as bar plot
ggplot(vizDataFrame2b, aes(x=month, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values = paste0(alphabet(20), "FF"), name = "topic") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Unfortunately, the month variable in my data is giving me a lot of trouble when visualizing in chronological order.
Since the date column is being difficult, I will try the visualization by article number (doc_id) instead, since those numbers were generated in order of publication date before the articles were grouped by month.
textdatacleaned <- read.csv("textdatacleaned.csv")
#convert non-graph characters to combat error in grid.Call
#topicNames2b=str_replace_all(topicNames2b,"[^[:graph:]]", " ")
# append month information for aggregation
#textdata$month <- paste0(substr(textdata$month.ended, 0, 3), "0")
# get mean topic proportions per month
topic_proportion_per_month <- aggregate(theta, by = list(month = textdatacleaned$month), mean)
# set topic names to aggregated columns
colnames(topic_proportion_per_month)[2:(K+1)] <- topicNames
# reshape data frame
vizDataFrame <- melt(topic_proportion_per_month, id.vars = "month")
# plot topic proportions per month as bar plot
ggplot(vizDataFrame, aes(x=month, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values = paste0(alphabet(20), "FF"), name = "topic") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
textdatacleaned <- read.csv("textdatacleaned.csv")
#convert non-graph characters to combat error in grid.Call
#topicNames2b=str_replace_all(topicNames2b,"[^[:graph:]]", " ")
# append month information for aggregation
#textdata$month <- paste0(substr(textdata$month.ended, 0, 3), "0")
# get mean topic proportions per month
topic_proportion_per_month2b <- aggregate(theta2b, by = list(month = textdatacleaned$month), mean)
# set topic names to aggregated columns
colnames(topic_proportion_per_month2b)[2:(K2+1)] <- topicNames2b
# reshape data frame
vizDataFrame2b <- melt(topic_proportion_per_month2b, id.vars = "month")
# plot topic proportions per month as bar plot
ggplot(vizDataFrame2b, aes(x=month, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values = paste0(alphabet(20), "FF"), name = "topic") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
That gives us a much better idea of how the topics changed over the course of the 19 months being analyzed!