Doppelgänger Tweets spaCy Analysis

This analysis identifies the most common words among semantic triplets extracted from probable Doppelgänger tweets collected between April 23 and May 3, 2024.

Categories

FIMI, International Affairs, Social Media, Doppelgänger

Author

InfoEpi Lab

Affiliation

Information Epidemiology Lab

Published

May 8, 2024

Modified

May 9, 2024

Top Words Among Semantic Triplets in Tweets

This section presents visual analyses of the most frequent words found in semantic triplets from tweets collected between April 23 and May 3, 2024. The first image, Figure 1, displays a word cloud highlighting these top words, giving a clear and immediate visual representation of the most dominant themes. The second image, Figure 2, removes the most frequently occurring word, “Ukraine,” from the visualization.

Figure 1: Top words among semantic triplets from tweets collected between April 23 and May 3, 2024

This adjustment allows for a more detailed exploration of the other significant but less dominant topics in the dataset, providing a broader view of the conversation dynamics during this period. The methodology for these graphics is described in the Methodology section below.

Figure 2: The same content as Figure 1 minus the top word, "Ukraine." This allows us to see other topics that are less popular but still significant.

Methodology

Install spaCy

!pip install spacy
!python -m spacy download en_core_web_lg
import spacy
nlp = spacy.load("en_core_web_lg")
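
Before running the extraction, a quick dependency-parse check confirms the model loaded correctly (a minimal sketch; the sentence is illustrative, not drawn from the dataset):

doc = nlp("Ukraine receives new military aid.")
print([(token.text, token.dep_, token.head.text) for token in doc])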

Extract Semantic Triplets

import spacy
import pandas as pd
from collections import defaultdict
import os

# Load the spaCy English model
nlp = spacy.load("en_core_web_lg")

# Extract (subject, predicate, object) triplets from a sentence
# using spaCy's dependency parse.
def extract_triplets(sentence):
    doc = nlp(sentence)  # Process the sentence with spaCy
    triplets = []
    for token in doc:
        if "subj" in token.dep_:  # matches nsubj, nsubjpass, csubj, etc.
            subject = token.text
            for verb in token.head.children:
                if verb.dep_ in ("aux", "relcl"):
                    predicate = verb.head.text
                else:
                    predicate = token.head.text
                for obj in verb.children:
                    if "obj" in obj.dep_:  # matches dobj, pobj, iobj, etc.
                        triplets.append((subject, predicate, obj.text))
    return triplets
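
# Optional sanity check on a single sentence (an illustrative example,
# not drawn from the dataset):
# print(extract_triplets("Analysts claim the West wants Ukraine to continue fighting."))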

file_path = 'text.txt'

if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

# Read the file and process each line
all_triplets = []
subject_counts = defaultdict(int)
predicate_counts = defaultdict(int)
object_counts = defaultdict(int)

with open(file_path, 'r', encoding='utf-8') as file:
    headlines = file.readlines()

    for headline in headlines:
        triplets_from_headline = extract_triplets(headline.strip())  # strip() removes leading/trailing whitespace
        all_triplets.extend(triplets_from_headline)
        for triplet in triplets_from_headline:
            subject, predicate, obj = triplet
            subject_counts[subject] += 1
            predicate_counts[predicate] += 1
            object_counts[obj] += 1


print(all_triplets)

top_subjects = sorted(subject_counts.items(), key=lambda x: x[1], reverse=True)[:10]
top_predicates = sorted(predicate_counts.items(), key=lambda x: x[1], reverse=True)[:10]
top_objects = sorted(object_counts.items(), key=lambda x: x[1], reverse=True)[:10]

print("Top Subjects:", top_subjects)
print("Top Predicates:", top_predicates)
print("Top Objects:", top_objects)

# Convert the list of triplets to a DataFrame
df_triplets = pd.DataFrame(all_triplets, columns=['Subject', 'Predicate', 'Object'])

# Save the DataFrame to a CSV file
df_triplets.to_csv('no_crypto_triplets_output.csv', index=False)

Find Common Subjects, Predicates, Objects

!pip install nltk
!python -m nltk.downloader wordnet
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from collections import defaultdict

# Load the CSV into a DataFrame
df_triplets = pd.read_csv("no_crypto_triplets_output.csv")

# Extract most frequent subjects, predicates, and objects
top_subjects = df_triplets['Subject'].value_counts().to_dict()
top_predicates = df_triplets['Predicate'].value_counts().to_dict()
top_objects = df_triplets['Object'].value_counts().to_dict()

# Initialize the stemmer
ps = PorterStemmer()

# Ensure the WordNet corpus is available
nltk.download('wordnet')

# Function to get synonyms (and their stems) of a word via WordNet
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ')  # replace underscores with spaces
            synonyms.add(synonym)
            synonyms.add(ps.stem(synonym))
    return synonyms
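
# Optional check (illustrative): inspect the synonym/stem set WordNet returns
# for a single word, e.g.:
# print(sorted(get_synonyms("conflict")))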

# Stemming and grouping synonyms for subjects, predicates, and objects
def stem_and_group_synonyms(words):
    stemmed_grouped = defaultdict(int)
    for word, count in words.items():
        stemmed_grouped[ps.stem(word)] += count  # defaultdict handles missing keys

    # Group synonyms
    synonym_grouped = defaultdict(int)
    for word, count in stemmed_grouped.items():
        synonyms = get_synonyms(word)
        if synonyms:
            key = min(synonyms, key=len)  # Use the shortest synonym as the key
        else:
            key = word
        synonym_grouped[key] += count

    return synonym_grouped

# Re-analyze the words after stemming and grouping by synonyms
grouped_subjects = stem_and_group_synonyms(top_subjects)
grouped_predicates = stem_and_group_synonyms(top_predicates)
grouped_objects = stem_and_group_synonyms(top_objects)

sorted_subjects = sorted(grouped_subjects.items(), key=lambda x: x[1], reverse=True)
sorted_predicates = sorted(grouped_predicates.items(), key=lambda x: x[1], reverse=True)
sorted_objects = sorted(grouped_objects.items(), key=lambda x: x[1], reverse=True)

sorted_subjects, sorted_predicates, sorted_objects
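
The bare tuple on the final line relies on notebook display. To preview only the leading terms in each category, a short slice works (a minimal sketch; the cutoff of ten is an arbitrary choice):

print("Top grouped subjects:", sorted_subjects[:10])
print("Top grouped predicates:", sorted_predicates[:10])
print("Top grouped objects:", sorted_objects[:10])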

Create Word Cloud

!pip install wordcloud
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

# Load the data from the CSV file
df_triplets = pd.read_csv('no_crypto_triplets_output.csv')  # Adjust the path if necessary

# Combine all the text data from the triplets into a single string
text = ' '.join(df_triplets['Subject'].fillna('') + ' ' + 
                df_triplets['Predicate'].fillna('') + ' ' + 
                df_triplets['Object'].fillna(''))

# Function to plot a word cloud, optionally removing the most frequent word
def plot_wordcloud(text, stopwords=None, remove_top_word=False, file_name="no_crypto_word_cloud.png"):
    if remove_top_word:
        # Identify and exclude the most frequent word
        frequency = WordCloud().process_text(text)
        most_common_word = max(frequency, key=frequency.get)
        stopwords = set(stopwords) if stopwords else set()  # copy to avoid mutating the caller's set
        stopwords.add(most_common_word)

    # Create and configure the WordCloud
    wordcloud = WordCloud(width=800, height=400, 
                          background_color='white', 
                          max_words=200, 
                          colormap='Blues',
                          stopwords=stopwords).generate(text)

    # Create a figure and plot space for the word cloud and the label
    fig, ax = plt.subplots(figsize=(10, 6))

    # Display the generated image:
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')

    # Add a label beneath the word cloud
    label = "Doppelgänger Semantic Triplets"
    ax.text(0, -0.1, label, fontsize=24, ha='left', transform=ax.transAxes, bbox=dict(facecolor='white', alpha=0.8))

    # Save the image with the specified filename, ensuring the entire figure (including label) is saved
    plt.savefig(file_name, bbox_inches='tight', pad_inches=1)

    plt.show()

# Custom stopwords (if any)
custom_stopwords = set(STOPWORDS)  # Add any custom stopwords here if needed

# Create the initial word cloud
plot_wordcloud(text, stopwords=custom_stopwords, file_name="no_crypto_word_cloud.png")

# Create the word cloud minus the top word
plot_wordcloud(text, stopwords=custom_stopwords, remove_top_word=True, file_name="no_crypto_word_cloud_without_top_word.png")
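
To confirm which token the remove_top_word branch excludes before generating the second figure (the article notes it was "Ukraine" for this dataset), the same process_text call used inside the function can be run directly (a minimal sketch reusing the text variable defined above):

word_frequencies = WordCloud().process_text(text)
print(max(word_frequencies, key=word_frequencies.get))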

Citation

BibTeX citation:
@article{infoepi_lab2024,
  author = {InfoEpi Lab},
  publisher = {Information Epidemiology Lab},
  title = {Doppelgänger {Tweets} {spaCy} {Analysis}},
  journal = {InfoEpi Lab},
  date = {2024-05-08},
  url = {https://infoepi.org/posts/2024/05/08-doppelganger_spaCy.html},
  langid = {en}
}
For attribution, please cite this work as:
InfoEpi Lab. 2024. “Doppelgänger Tweets spaCy Analysis.” InfoEpi Lab, May. https://infoepi.org/posts/2024/05/08-doppelganger_spaCy.html.