Top Words Among Semantic Triplets in Tweets
This section presents visual analyses of the most frequent words in the semantic triplets extracted from tweets collected between April 23 and May 3, 2024. The first image, Figure 1, displays a word cloud of these top words, giving an immediate view of the dominant themes. The second image, Figure 2, removes the most frequently occurring word, “Ukraine,” from the visualization.
This adjustment makes room for the significant but less dominant topics in the dataset, providing a broader view of the conversation dynamics during this period. The methodology for these graphics can be found here: Doppelgänger Tweets spaCy Analysis.
Methodology
Install spaCy
!pip install spacy
!python -m spacy download en_core_web_lg

import spacy

nlp = spacy.load("en_core_web_lg")
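A quick optional check that the model loaded correctly is to parse a short sentence and inspect the dependency labels that the triplet extraction below relies on. This is a minimal sketch; the sentence is invented for illustration and is not from the dataset:

# Optional sanity check: the sentence is illustrative, not from the dataset.
doc = nlp("Ukraine welcomed the decision.")
print([(token.text, token.dep_, token.head.text) for token in doc])
# Each token carries a dependency label (dep_) and a head token; the
# extraction step keys off labels containing "subj" and "obj".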
Extract Semantic Triplets
import spacy
import pandas as pd
from collections import defaultdict
import os

# Load the SpaCy model
nlp = spacy.load("en_core_web_lg")

# This function does not read triplets from a file; it extracts
# (subject, predicate, object) triplets from a sentence using SpaCy's
# dependency parse.
def extract_triplets(sentence):
    doc = nlp(sentence)  # Process the sentence with SpaCy
    triplets = []
    for token in doc:
        if "subj" in token.dep_:
            subject = token.text
            for verb in token.head.children:
                if verb.dep_ in ("aux", "relcl"):
                    predicate = verb.head.text
                else:
                    predicate = token.head.text
                for obj in verb.children:
                    if "obj" in obj.dep_:
                        triplets.append((subject, predicate, obj.text))
    return triplets

file_path = 'text.txt'

if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
    exit(1)

# Read the file and process each line
all_triplets = []
subject_counts = defaultdict(int)
predicate_counts = defaultdict(int)
object_counts = defaultdict(int)

with open(file_path, 'r', encoding='utf-8') as file:
    headlines = file.readlines()

for headline in headlines:
    triplets_from_headline = extract_triplets(headline.strip())  # strip() removes leading/trailing whitespace
    all_triplets.extend(triplets_from_headline)
    for triplet in triplets_from_headline:
        subject, predicate, obj = triplet
        subject_counts[subject] += 1
        predicate_counts[predicate] += 1
        object_counts[obj] += 1

print(all_triplets)

top_subjects = sorted(subject_counts.items(), key=lambda x: x[1], reverse=True)[:10]
top_predicates = sorted(predicate_counts.items(), key=lambda x: x[1], reverse=True)[:10]
top_objects = sorted(object_counts.items(), key=lambda x: x[1], reverse=True)[:10]

print("Top Subjects:", top_subjects)
print("Top Predicates:", top_predicates)
print("Top Objects:", top_objects)

# Convert the list of triplets to a DataFrame
df_triplets = pd.DataFrame(all_triplets, columns=['Subject', 'Predicate', 'Object'])

# Save the DataFrame to a CSV file
df_triplets.to_csv('no_crypto_triplets_output.csv', index=False)
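Before processing the whole file, the extractor can be tried on a single sentence to see the shape of its output. A minimal sketch; the sentence is invented, and the exact tuples depend on the dependency parse spaCy produces:

# Illustrative only: an invented sentence, not drawn from the collected tweets.
sample = "Ukraine wants to join NATO."
print(extract_triplets(sample))
# Returns a list of (subject, predicate, object) tuples; the exact
# contents vary with how spaCy parses the sentence.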
Find Common Subjects, Predicates, Objects
!pip install nltk
!python -m nltk.downloader wordnet
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from collections import defaultdict

# Download the WordNet data used for synonym lookups
nltk.download('wordnet')

# Load the CSV into a DataFrame
df_triplets = pd.read_csv("no_crypto_triplets_output.csv")

# Extract most frequent subjects, predicates, and objects
top_subjects = df_triplets['Subject'].value_counts().to_dict()
top_predicates = df_triplets['Predicate'].value_counts().to_dict()
top_objects = df_triplets['Object'].value_counts().to_dict()

# Initialize the stemmer
ps = PorterStemmer()

# Function to get synonyms of a word
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ')  # replace underscores with spaces
            synonyms.add(synonym)
            synonyms.add(ps.stem(synonym))
    return synonyms

# Stem words, then group them by shared synonyms
def stem_and_group_synonyms(words):
    stemmed_grouped = defaultdict(int)
    for word, count in words.items():
        stemmed_word = ps.stem(word)
        stemmed_grouped[stemmed_word] += count

    # Group synonyms
    synonym_grouped = defaultdict(int)
    for word, count in stemmed_grouped.items():
        synonyms = get_synonyms(word)
        if synonyms:
            key = min(synonyms, key=len)  # Use the shortest synonym as the key
        else:
            key = word
        synonym_grouped[key] += count

    return synonym_grouped

# Re-analyze the words after stemming and grouping by synonyms
grouped_subjects = stem_and_group_synonyms(top_subjects)
grouped_predicates = stem_and_group_synonyms(top_predicates)
grouped_objects = stem_and_group_synonyms(top_objects)

sorted_subjects = sorted(grouped_subjects.items(), key=lambda x: x[1], reverse=True)
sorted_predicates = sorted(grouped_predicates.items(), key=lambda x: x[1], reverse=True)
sorted_objects = sorted(grouped_objects.items(), key=lambda x: x[1], reverse=True)

sorted_subjects, sorted_predicates, sorted_objects
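A small, self-contained way to sanity-check the grouping is to feed it invented counts. The words and numbers below are hypothetical, not dataset values, and the resulting keys depend on WordNet's synsets:

# Illustrative only: invented words and counts, not values from the dataset.
toy_counts = {"attacks": 5, "attack": 3, "strike": 2}
print(stem_and_group_synonyms(toy_counts))
# Stemming folds "attacks" and "attack" into one stem first; the synonym
# pass then replaces each stem with its shortest WordNet synonym (when
# one exists), so the exact keys vary with the WordNet data.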
Create Word Cloud
!pip install wordcloud

import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Load the data from the CSV file
df_triplets = pd.read_csv('no_crypto_triplets_output.csv')  # Adjust the path if necessary

# Combine all the text data from the triplets into a single string
text = ' '.join(df_triplets['Subject'].fillna('') + ' ' +
                df_triplets['Predicate'].fillna('') + ' ' +
                df_triplets['Object'].fillna(''))

# Function to plot word cloud
def plot_wordcloud(text, stopwords=None, remove_top_word=False, file_name="no_crypto_word_cloud.png"):
    if remove_top_word:
        # Remove the most frequent word
        frequency = WordCloud().process_text(text)
        most_common_word = max(frequency, key=frequency.get)
        stopwords = stopwords if stopwords else set()
        stopwords.add(most_common_word)

    # Create and configure the WordCloud
    wordcloud = WordCloud(width=800, height=400,
                          background_color='white',
                          max_words=200,
                          colormap='Blues',
                          stopwords=stopwords).generate(text)

    # Create a figure and plot space for the word cloud and the label
    fig, ax = plt.subplots(figsize=(10, 6))

    # Display the generated image
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')

    # Add a label
    label = "Doppelgänger Semantic Triplets"  # Replace with your desired label
    ax.text(0, -0.1, label, fontsize=24, ha='left', transform=ax.transAxes,
            bbox=dict(facecolor='white', alpha=0.8))

    # Save the image with the specified filename, ensuring the entire
    # figure (including the label) is saved
    plt.savefig(file_name, bbox_inches='tight', pad_inches=1)
    plt.show()

# Custom stopwords (if any)
custom_stopwords = set(STOPWORDS)  # Add any custom stopwords here if needed

# Create the initial word cloud
plot_wordcloud(text, stopwords=custom_stopwords, file_name="no_crypto_word_cloud.png")

# Create the word cloud minus the top word
plot_wordcloud(text, stopwords=custom_stopwords, remove_top_word=True,
               file_name="no_crypto_word_cloud_without_top_word.png")
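Since “Ukraine” is reported above as the most frequent word, Figure 2 could equally be produced by adding it to the stopword set explicitly rather than relying on remove_top_word=True. A sketch of that variant, assuming “Ukraine” is indeed the top token in your run:

# Explicit alternative to remove_top_word=True: pin the excluded word.
# Assumes "Ukraine" is the most frequent word, as reported above.
explicit_stopwords = set(STOPWORDS) | {"Ukraine"}
plot_wordcloud(text, stopwords=explicit_stopwords,
               file_name="no_crypto_word_cloud_without_top_word.png")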
Citation
@article{infoepi_lab2024,
author = {InfoEpi Lab},
publisher = {Information Epidemiology Lab},
title = {Doppelgänger {Tweets} {spaCy} {Analysis}},
journal = {InfoEpi Lab},
date = {2024-05-08},
url = {https://infoepi.org/posts/2024/05/08-doppelganger_spaCy.html},
langid = {en}
}