In [625]:
import pandas as pd 
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
import matplotlib.pyplot as plt
import praw
import numpy as np
import time 
import re
import seaborn as sns
In [4]:
pandas2ri.activate()

# Get a handle to R's readRDS function
read_rds = ro.r('readRDS')

# Call the R function `readRDS` from Python
r_data = read_rds('reddit.RDS')


# Convert the R object to a Python/pandas structure (rpy2py is the R -> Python direction)
df = pandas2ri.rpy2py(r_data)

df = pd.DataFrame(df)
df = df.T

column_names = ['tag', 'title', 'author', 'comments', 'id', 'num_comments',
       'permalink', 'score', 'upvote_ratio', 'url', 'created_utc',
       'comments_feed']
df.columns = column_names

meta_df = df.copy()
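For reference, rpy2's documented pattern for this conversion uses a scoped localconverter rather than the global activate(); a minimal sketch (not run here) that assumes reddit.RDS holds an R data.frame:

In [ ]:
# Hedged alternative sketch: scope the pandas conversion with localconverter
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

r_obj = ro.r('readRDS')('reddit.RDS')

# assumes the RDS file contains an R data.frame
with localconverter(ro.default_converter + pandas2ri.converter):
    reddit_df = ro.conversion.rpy2py(r_obj)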
In [5]:
meta_df[:1]
Out[5]:
tag title author comments id num_comments permalink score upvote_ratio url created_utc comments_feed
0 beltline Beltline's Eastside Trail transit plan meets o... cox_ph <praw.models.comment_forest.CommentForest obje... 12l1rb1 248 /r/Atlanta/comments/12l1rb1/beltlines_eastside... 162 0.96 https://www.axios.com/local/atlanta/2023/04/13... 1681416024.0 [[Redditor(name='Thrasher678'), "Just tell me ...
In [6]:
# Read meta_comments_df.csv
meta_comments_df = pd.read_csv('metav2_comments_df.csv', index_col=0)
In [11]:
meta_comments_df[:2]
Out[11]:
comment_id parent_id body author time score subreddit_id thread_id year subreddit_name sentiment bert_sentiment
0 jg5b9y4 NaN Just tell me I will be able to ride it from PC... Thrasher678 2023-04-13 17:20:20 161 t5_2qiq9 12l1rb1 2023 Atlanta 0.0000 0.998572
1 jg51z7v NaN Streetcar was approved as the LPA by the Marta... NaN 2023-04-13 16:20:23 143 t5_2qiq9 12l1rb1 2023 Atlanta 0.4215 0.760574
In [624]:
# using meta_comments_df, plot the number of unique users in each subreddit with plotly
import plotly.express as px
fig = px.bar(meta_comments_df.groupby('subreddit_name')['author'].nunique(), title='Users engaging with "rails-to-trails" on Subreddits')

fig.update_layout(yaxis_title='Number of unique users')
fig.show()
In [468]:
import plotly.graph_objects as go

# Count the number of comments & posts per year
counts = meta_comments_df['year'].value_counts().sort_index()

# Create a bar chart
fig = go.Figure(data=go.Bar(x=counts.index, y=counts.values, marker_color='green'))

# Set layout properties
fig.update_layout(
    title='Number of Comments & Posts per Year',
    xaxis_title='Year',
    yaxis_title='Number of Comments & Posts',
    plot_bgcolor='lightgrey',
    autosize=False,
    width=1000,
    height=500,
)

# Show the plot
fig.show()
In [13]:
# get sentiment of each comment from BERT
from transformers import pipeline
classifier = pipeline('sentiment-analysis')
c:\Users\Mohsin\AppData\Local\Programs\Python\Python310\lib\site-packages\tqdm\auto.py:21: TqdmWarning:

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

WARNING:tensorflow:From c:\Users\Mohsin\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
In [14]:
# Classify a sentence
sentence = "I am not sure about programming but i am thinking!"

from transformers import BertTokenizer

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def bertSentiment(sentence):
    # Note: this bert-base-uncased tokenizer is only used for rough truncation;
    # the pipeline above uses its own (distilbert-based) tokenizer internally.
    # Truncate to 500 tokens so the sequence (plus special tokens) stays under
    # the model's 512-token limit.
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:500]

    # Convert tokens back to a string and classify it
    text = tokenizer.convert_tokens_to_string(tokens)

    result = classifier(text)
    # Map to a 0-1 scale: confidence for POSITIVE, 1 - confidence for NEGATIVE
    sentiment_score = result[0]['score'] if result[0]['label'] == 'POSITIVE' else 1 - result[0]['score']
    return sentiment_score
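If the installed version of transformers forwards tokenizer keyword arguments through the pipeline call (recent releases do), truncation can be handled by the pipeline itself. A minimal alternative sketch, not what was run above:

In [ ]:
# Alternative sketch: let the pipeline's own tokenizer truncate to the model's 512-token limit.
# Whether truncation=True is accepted on the call depends on the installed transformers version.
def bert_sentiment_simple(sentence):
    result = classifier(sentence, truncation=True)[0]
    return result['score'] if result['label'] == 'POSITIVE' else 1 - result['score']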
In [15]:
# calculate sentiment of comments by user using VADER
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

def get_sentiment(text):
    return sid.polarity_scores(text)["compound"]
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Mohsin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
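VADER's compound score is a single number in [-1, 1]; a quick sanity check of get_sentiment:

In [ ]:
# quick sanity check: positive text yields a positive compound score, negative text a negative one
print(get_sentiment("The new trail is fantastic!"))       # positive compound score
print(get_sentiment("Traffic on this street is awful."))  # negative compound score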

Creating dataframe about User Activity¶

In [16]:
comments_count_by_user = {}

for idx, row in meta_comments_df.iterrows():
    user = row['author']
    text = row['body']
    comment_time = row['time']
    # convert time to unix timestamp
    comment_time = time.mktime(time.strptime(comment_time, "%Y-%m-%d %H:%M:%S"))
    upvotes = row['score']
    thread_id = row['thread_id']
    # get the sentiment of the comment
    sentiment = get_sentiment(text)

    if user is not None:
        if user not in comments_count_by_user:
            # adding the count of comments by user
            comments_count_by_user[user] = {"user": user,
                                            "comments_count": 1,
                                            "last_comment": comment_time,
                                            "first_comment": comment_time,
                                            "tot_sentiment": sentiment,
                                            "upvotes": upvotes,
                                            "thread_id": [thread_id]}
        else:
            comments_count_by_user[user]["comments_count"] += 1
            comments_count_by_user[user]["tot_sentiment"] += sentiment
            comments_count_by_user[user]["upvotes"] += upvotes
            if thread_id not in comments_count_by_user[user]["thread_id"]:
                comments_count_by_user[user]["thread_id"].append(thread_id)

            if comment_time > comments_count_by_user[user]["last_comment"]:
                comments_count_by_user[user]["last_comment"] = comment_time
            if comment_time < comments_count_by_user[user]["first_comment"]:    
                comments_count_by_user[user]["first_comment"] = comment_time


# counting the authors of OP in the comments feed
for index, entry in meta_df.iterrows():
    user = entry["author"]
    comment_time = entry["created_utc"]
    text = entry["title"]
    sentiment = get_sentiment(text)
    upvotes = entry["score"]
    thread_id = entry["id"]

    if user not in comments_count_by_user:
        comments_count_by_user[user] = {"user": user,
                                        "comments_count": 1,
                                        "last_comment": comment_time,
                                        "first_comment": comment_time,
                                        "tot_sentiment": sentiment,
                                        "upvotes": upvotes,
                                        "thread_id": [thread_id]}
    else:
        comments_count_by_user[user]["comments_count"] += 1
        comments_count_by_user[user]["tot_sentiment"] += sentiment
        comments_count_by_user[user]["upvotes"] += upvotes
        if thread_id not in comments_count_by_user[user]["thread_id"]:
            comments_count_by_user[user]["thread_id"].append(thread_id)

        if comment_time > comments_count_by_user[user]["last_comment"]:
            comments_count_by_user[user]["last_comment"] = comment_time
        if comment_time < comments_count_by_user[user]["first_comment"]:    
            comments_count_by_user[user]["first_comment"] = comment_time 
In [17]:
from datetime import datetime as dt

# convert utc to datetime
def convert_utc_to_datetime(utc):
    return dt.fromtimestamp(utc)
In [18]:
users_df = pd.DataFrame(comments_count_by_user).T

# convert utc to datetime in users_df
users_df["last_comment"] = users_df["last_comment"].apply(convert_utc_to_datetime)
users_df["first_comment"] = users_df["first_comment"].apply(convert_utc_to_datetime)

# calculate frequency of comments by user
users_df["frequency days/comment"] = (users_df["last_comment"] - users_df["first_comment"]).dt.days / users_df["comments_count"]

# truncate the frequency to whole days per comment
users_df["frequency days/comment"] = users_df["frequency days/comment"].astype(int)

# getting the avg sentiment 
users_df["avg_sentiment"] = users_df["tot_sentiment"] / users_df["comments_count"]


# copy the slice so the new columns below do not trigger SettingWithCopyWarning
fltrd_users_df = users_df[users_df["comments_count"] > 1].copy()

fltrd_users_df["threads_active"] = fltrd_users_df["thread_id"].apply(len)

fltrd_users_df["comments_count"] = pd.to_numeric(fltrd_users_df["comments_count"], errors='coerce')
In [19]:
fltrd_users_df[(fltrd_users_df["avg_sentiment"] < 0) & (fltrd_users_df["comments_count"] > 30)]
Out[19]:
user comments_count last_comment first_comment tot_sentiment upvotes thread_id frequency days/comment avg_sentiment threads_active
ArchEast ArchEast 214 2023-09-12 09:31:40 2013-12-06 08:20:31 -0.213 2622 [12l1rb1, 13enrrk, y90d3d, zh7q18, ummq0s, 10s... 16 -0.000995 38
Antilon Antilon 31 2023-09-12 12:46:00 2019-06-04 12:23:29 -0.3766 304 [12l1rb1, 13enrrk, 15g8w4p, 1496bj3, 16g4d23, ... 50 -0.012148 8
dbclass dbclass 64 2023-05-13 18:33:17 2020-06-04 13:06:46 -0.6198 1495 [13enrrk, y90d3d, zh7q18, ummq0s, xe4hdo, 12bu... 16 -0.009684 25
possibilistic possibilistic 37 2022-09-14 21:02:01 2019-04-19 16:07:34 -7.7106 79 [xe4hdo, go1inq, vxb0ho, qwdny4, w7qtx7, ftwgi... 33 -0.208395 11
PhileasFoggsTrvlAgt PhileasFoggsTrvlAgt 40 2023-08-09 15:47:11 2019-09-09 11:17:20 -0.6742 507 [142su0m, d8ng48, j9zwqk, jv8bl5, e1i4yo, em1z... 35 -0.016855 20
wpm wpm 57 2023-07-24 20:03:45 2016-04-26 18:29:51 -8.9059 398 [816bus, fp3dhm, 14ypomu, p2ekia, e1i4yo, 4lwr... 46 -0.156244 33
abuchewbacca1995 abuchewbacca1995 66 2023-10-06 14:40:59 2019-05-16 11:59:37 -2.5997 -28 [16n5r1y, bpdlsd, fefedk, 171hc9p] 24 -0.039389 4
ColHaberdasher ColHaberdasher 37 2019-02-08 21:31:00 2019-02-07 16:00:50 -10.2833 94 [ao5rmj] 0 -0.277927 1
jhb42 jhb42 45 2023-06-21 01:24:37 2023-06-09 22:09:02 -6.0877 57 [1458pvy] 0 -0.135282 1
In [35]:
plt.style.use('fivethirtyeight')

# Create a diverging palette
palette = sns.diverging_palette(20, 256,center="dark", n=256)

# Create a function to map the colors
def map_colors(value):
    value_index = int((value - fltrd_users_df["avg_sentiment"].min()) / (fltrd_users_df["avg_sentiment"].max() - fltrd_users_df["avg_sentiment"].min()) * 255)
    return palette[value_index]

# Create the histogram
plt.figure(figsize=(10, 6))
counts, bins, patches = plt.hist(fltrd_users_df["avg_sentiment"], bins=50)

# Color each bin
for patch, leftside, rightside in zip(patches, bins[:-1], bins[1:]):
    x = np.mean([leftside, rightside])
    color = map_colors(x)
    patch.set_facecolor(color)

plt.xlabel("Average Sentiment")
plt.ylabel("Number of Users")
plt.title("Distribution of Average Sentiment by User")
plt.grid(False)
plt.show()
No description has been provided for this image
In [623]:
plt.style.use('fivethirtyeight')

# plot a distribution of the sentiment of comments
plt.figure(figsize=(10, 6))
counts, bins, patches = plt.hist(meta_comments_df["sentiment"], bins=20)
plt.xlabel("Average Sentiment")
plt.ylabel("Number of Users")
plt.title("Distribution of Average Sentiment by User")
plt.grid(False)
plt.show()
No description has been provided for this image
In [36]:
plt.figure(figsize=(10, 8))
#plt.plot(fltrd_users_df["avg_sentiment"],fltrd_users_df["comments_count"],alpha=0.5)

data = np.log10(fltrd_users_df["threads_active"])
# clip scores at 1 before log10 to avoid divide-by-zero / invalid values for zero or negative upvote totals
colors = np.log10(fltrd_users_df["upvotes"].astype(float).clip(lower=1))

# marker size and color both scale with log-upvotes
plt.scatter(data, fltrd_users_df["avg_sentiment"], alpha=0.5, s=20 + 40 * colors, c=colors, cmap='viridis')

plt.title("Average Sentiment Distribution by Users Thread Activity")
plt.colorbar(label='log10(upvotes)')
plt.show()
No description has been provided for this image
In [20]:
fltrd_users_df[:3]
Out[20]:
user comments_count last_comment first_comment tot_sentiment upvotes thread_id frequency days/comment avg_sentiment threads_active
Thrasher678 Thrasher678 3 2023-04-13 17:20:20 2017-02-25 19:14:12 1.1361 131 [12l1rb1, 5w59gx] 745 0.3787 2
NaN NaN 4484 2023-10-31 12:17:55 2009-12-18 12:26:25 380.6435 25609 [12l1rb1, 13enrrk, y90d3d, zh7q18, ummq0s, c0j... 1 0.084889 862
wambulancer wambulancer 4 2023-08-22 11:42:11 2023-04-13 16:16:36 -1.2466 472 [12l1rb1, 15y3vnm, 13p0k8o] 32 -0.31165 3
In [36]:
plt.scatter(np.log10(fltrd_users_df["comments_count"]), fltrd_users_df["avg_sentiment"], alpha=0.5)
Out[36]:
<matplotlib.collections.PathCollection at 0x1a9f2bc5fd0>
No description has been provided for this image

So the more a user engages, whether by commenting or posting, the more nuanced (closer to neutral) their average sentiment appears?

In [37]:
sns.histplot(np.log10(fltrd_users_df["tot_sentiment"].astype(int)), bins=20, kde=True)
c:\Users\Mohsin\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\arraylike.py:396: RuntimeWarning:

divide by zero encountered in log10

c:\Users\Mohsin\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\arraylike.py:396: RuntimeWarning:

invalid value encountered in log10

Out[37]:
<Axes: xlabel='tot_sentiment', ylabel='Count'>
No description has been provided for this image
In [ ]:
 
In [21]:
meta_comments_df[:3]
Out[21]:
comment_id parent_id body author time score subreddit_id thread_id year subreddit_name sentiment bert_sentiment
0 jg5b9y4 NaN Just tell me I will be able to ride it from PC... Thrasher678 2023-04-13 17:20:20 161 t5_2qiq9 12l1rb1 2023 Atlanta 0.0000 0.998572
1 jg51z7v NaN Streetcar was approved as the LPA by the Marta... NaN 2023-04-13 16:20:23 143 t5_2qiq9 12l1rb1 2023 Atlanta 0.4215 0.760574
2 jg51dtf NaN Just a friendly reminder: there is a limitless... wambulancer 2023-04-13 16:16:36 314 t5_2qiq9 12l1rb1 2023 Atlanta 0.0498 0.994863
In [46]:
indepth_df = []

# collect comments from a few highly active users and re-score them with VADER
target_users = ("killroy200", "Miser", "ArchEast")

for idx, row in meta_comments_df.iterrows():
    user = row['author']
    text = row['body']

    if user in target_users:
        row = row.tolist()
        row.append(get_sentiment(text))
        indepth_df.append(row)

indepth_df = pd.DataFrame(indepth_df)
indepth_df.columns = ["comment_id", "parent_id", "body", "author", "time", "score", "subreddit_id",
                      "thread_id", "year", "subreddit_name", "sentiment", "bert", "senti"]
In [83]:
indepth_df
Out[83]:
comment_id parent_id body author time score subreddit_id thread_id year subreddit_name sentiment
0 jg64ntj jg5b9y4 It'll get you close (Centennial Olympic Park).... killroy200 2023-04-13 20:54:36 14 t5_2qiq9 12l1rb1 2023 Atlanta -0.1027
1 jg62ntz jg5bc2b I may be one voice, but I'll speak as a reside... killroy200 2023-04-13 20:39:17 25 t5_2qiq9 12l1rb1 2023 Atlanta -0.7227
2 jg8r9ez jg84ayd A dedicated lane is on the City of Atlanta to ... ArchEast 2023-04-14 11:54:26 4 t5_2qiq9 12l1rb1 2023 Atlanta 0.4588
3 jg85qup jg6zqaz Okay, and then we have light rail for generati... killroy200 2023-04-14 09:30:13 4 t5_2qiq9 12l1rb1 2023 Atlanta 0.5267
4 jg64iaf jg5eqr3 Oh no! Not... frequent, convenient, modern, hi... killroy200 2023-04-13 20:53:25 24 t5_2qiq9 12l1rb1 2023 Atlanta -0.1511
... ... ... ... ... ... ... ... ... ... ... ...
849 i3yj93b i3yhawo Technically I think they think the scooters sh... Miser 2022-04-08 18:04:50 2 t5_34bx9x tz4ro5 2022 MicromobilityNYC -0.6495
850 i3zofmc i3zljpy I don't know if your saw that other thread in ... Miser 2022-04-08 23:24:49 4 t5_34bx9x tz4ro5 2022 MicromobilityNYC 0.9780
851 i467fu3 i44psod They are pretty cheap if you get the subsidize... Miser 2022-04-10 11:15:31 2 t5_34bx9x tz4ro5 2022 MicromobilityNYC 0.6653
852 hqq5rvn hqpvie4 I think the idea would be for pretty local rou... Miser 2021-12-31 15:51:42 2 t5_34bx9x rsz3un 2021 MicromobilityNYC 0.1779
853 hqq7584 hqq68yq Yeah it's one of those things that could exist... Miser 2021-12-31 16:01:24 1 t5_34bx9x rsz3un 2021 MicromobilityNYC 0.3818

854 rows × 11 columns

In [49]:
# plot sentiment by year for each user
plt.figure(figsize=(10, 8))
sns.lineplot(data=indepth_df, x="year", y="sentiment", hue="author")
plt.title("Sentiment by Year")
plt.show()
No description has been provided for this image

Apply sentiment to the dataframe¶

In [45]:
# apply sentiment analysis to the comments
meta_comments_df["sentiment"] = meta_comments_df["body"].apply(get_sentiment)
In [50]:
meta_comments_df["bert_sentiment"] = meta_comments_df["body"].apply(bertSentiment)
In [85]:
meta_comments_df["bert_label"] = meta_comments_df["bert_sentiment"].apply(lambda x: "positive" if x > 0.5 else "negative")
In [ ]:
# export meta_comments_df (with the new sentiment columns) to csv, then reload it
# meta_comments_df.to_csv('metav2_comments_df.csv')
meta_comments_df = pd.read_csv('metav2_comments_df.csv', index_col=0)
In [22]:
active_users = fltrd_users_df["user"]

# filter meta_comments_df down to the users kept in fltrd_users_df
select_comments_df = meta_comments_df[meta_comments_df["author"].isin(active_users)]
In [57]:
# plot sentiment by year by subreddit with a seaborn lineplot


plt.figure(figsize=(20, 16))
sns.lineplot(data=select_comments_df, x="year", y="bert_sentiment", hue="subreddit_name")
plt.title("Sentiment by Year")
plt.show()
No description has been provided for this image

Topic Clustering¶

Finding the optimal number of topic clusters using topic coherence¶

In [59]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
import nltk
import re

# Downloading the NLTK WordNet data used by the lemmatizer
nltk.download('wordnet')

# Text preprocessing function
def preprocess(text):
    result = []
    lemmatizer = WordNetLemmatizer()
    for token in text.lower().split():
        token = re.sub(r'\W+', '', token)  # remove non-alphanumeric characters
        if token not in STOPWORDS and len(token) > 3:
            result.append(lemmatizer.lemmatize(token))
    return result

# Preprocess the text data
processed_docs = meta_comments_df['body'].map(preprocess)

# Create a dictionary representation of the documents
dictionary = Dictionary(processed_docs)

# Filter out words that occur in less than 20 documents, or more than 50% of the documents
dictionary.filter_extremes(no_below=20, no_above=0.5)

# Create a bag-of-words model for each document
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Range of topic numbers to evaluate (narrowed to a single value here; widen it, e.g. range(2, 30), to actually search)
topic_range = range(19, 20)

# Store the models and their coherence scores
lda_models = []
coherence_scores = []

for num_topics in topic_range:
    # Create and train the LDA model
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=0)
    lda_models.append(lda)

    # Calculate coherence score
    coherence_model = CoherenceModel(model=lda, texts=processed_docs, dictionary=dictionary, coherence='c_v')
    coherence_scores.append(coherence_model.get_coherence())

# Prepare for plotting
topics_coherence = list(zip(topic_range, coherence_scores))

# Display the coherence scores
topics_coherence
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mohsin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[59]:
[(19, 0.5243763802191737)]
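Since the range above was narrowed to a single value, the sweep the section title describes would look roughly like this (a sketch; the bounds and step are arbitrary choices):

In [ ]:
# Sketch: sweep several topic counts and keep the model with the highest c_v coherence
best_k, best_score, best_model = None, -1.0, None
for k in range(2, 31, 2):  # arbitrary search range
    m = LdaModel(corpus=corpus, id2word=dictionary, num_topics=k, random_state=0)
    cm = CoherenceModel(model=m, texts=processed_docs, dictionary=dictionary, coherence='c_v')
    score = cm.get_coherence()
    if score > best_score:
        best_k, best_score, best_model = k, score, m

best_k, best_score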

Using Hierarchical Dirichlet Process (HDP) for finding optimal topics¶

In [ ]:
from gensim.models import HdpModel

# Training the HDP model
hdp_model = HdpModel(corpus=corpus, id2word=dictionary)

# Getting the topics from HDP
# Note: show_topics() returns only 20 topics by default, so len() of this result
# is capped at 20 and can under-count the topics HDP actually allocated
hdp_topics = hdp_model.show_topics(formatted=False)

# Determining the number of topics chosen by HDP
num_topics_hdp = len(hdp_topics)

# Displaying the number of topics and the first 5 topics as an example
num_topics_hdp, hdp_topics[:5]
In [114]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Text Preprocessing and Tokenization
# We use CountVectorizer for both tokenization and creating a document-term matrix
# We remove English stop words and limit the features to the top 1000 words for efficiency

vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
dtm = vectorizer.fit_transform(meta_comments_df['body'])  # Document-term matrix

# Applying LDA

n_topics = 4
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
lda.fit(dtm)

# Function to display top words for each topic
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
    print("\n")

# Display the top 20 words for each topic
display_topics(lda, vectorizer.get_feature_names_out(), 20)
Topic 1:
bike trail ride way just cars path traffic park rail like road bikes beltline people lanes lane car use street
Topic 2:
https com www deleted org http reddit greenway chicago detroit maps park nyc google comments map new news miles link
Topic 3:
people city just don years think like atlanta new area going beltline money live want make lot public building work
Topic 4:
like just don people good know ve time really did think ll thanks great right got nice pretty love didn


In [73]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Get the topic distribution for each comment
topic_distributions = lda.transform(dtm)

# t-SNE for dimensionality reduction to 2D
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, n_iter=300)
tsne_lda = tsne_model.fit_transform(topic_distributions)

# Assign each comment to the most dominant topic
dominant_topic = np.argmax(topic_distributions, axis=1)

# Creating a scatter plot of the t-SNE output
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(x=tsne_lda[:,0], y=tsne_lda[:,1], hue=dominant_topic, palette="deep", ax=ax)

ax.set_title('t-SNE visualization of LDA topic distributions')
ax.set_xlabel('t-SNE Dimension 1')
ax.set_ylabel('t-SNE Dimension 2')
plt.legend(title='Dominant Topic', labels=[f'Topic {i+1}' for i in range(n_topics)])
plt.show()
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 37796 samples in 0.020s...
[t-SNE] Computed neighbors for 37796 samples in 0.613s...
[t-SNE] Computed conditional probabilities for sample 1000 / 37796
[t-SNE] ...
[t-SNE] Computed conditional probabilities for sample 37796 / 37796
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 74.110100
[t-SNE] KL divergence after 300 iterations: 3.005099
No description has been provided for this image

Creating a keyword co-occurrence network¶

In [74]:
from collections import defaultdict
import itertools


# function to preprocess and extract keywords
def extract_keywords(text):
    # Basic preprocessing to tokenize and remove stop words
    tokens = preprocess(text)
    return tokens

# Extracting keywords from each comment
processed_comments = meta_comments_df['body'].map(extract_keywords)

# Counting word frequencies
word_freq = defaultdict(int)
for comment in processed_comments:
    for word in comment:
        word_freq[word] += 1

# Selecting top N keywords for the analysis
N = 100  # Number of top frequent words to consider
top_keywords = sorted(word_freq, key=word_freq.get, reverse=True)[:N]


# define a function to update the co-occurrence matrix for each comment
def update_cooccurrence_matrix(matrix, comment, keywords):
    for word1, word2 in itertools.combinations(comment, 2):
        if word1 in keywords and word2 in keywords:
            matrix[word1][word2] += 1
            matrix[word2][word1] += 1

# Creating the co-occurrence matrix
cooccurrence_matrix = defaultdict(lambda: defaultdict(int))
for comment in processed_comments:
    update_cooccurrence_matrix(cooccurrence_matrix, comment, top_keywords)

# Convert cooccurrence_matrix to a format suitable for NetworkX
cooccurrence_data = [(word1, word2, cooccurrence_matrix[word1][word2]) 
                     for word1 in cooccurrence_matrix 
                     for word2 in cooccurrence_matrix[word1] 
                     if word1 != word2]

cooccurrence_data[:10]  # Display first 10 pairs 
Out[74]:
[('ride', 'state', 110),
 ('ride', 'street', 466),
 ('ride', 'north', 268),
 ('ride', 'point', 190),
 ('ride', 'city', 495),
 ('ride', 'sure', 206),
 ('ride', 'stop', 215),
 ('ride', 'bike', 1759),
 ('ride', 'people', 693),
 ('ride', 'beltline', 138)]
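cooccurrence_data is already in edge-list form, so the network the heading promises can be built directly with NetworkX. A minimal sketch (assumes networkx is installed; the weight cutoff is an arbitrary choice to keep the graph legible):

In [ ]:
# Sketch: build the keyword co-occurrence graph from cooccurrence_data
import networkx as nx

G = nx.Graph()
for word1, word2, weight in cooccurrence_data:
    if weight >= 200:  # arbitrary cutoff so the network stays readable
        G.add_edge(word1, word2, weight=weight)

G.number_of_nodes(), G.number_of_edges()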
In [76]:
heatmap_cooccurrence_data = {}

for entry in cooccurrence_data:
    heatmap_cooccurrence_data[(entry[0], entry[1])] = entry[2]
In [77]:
sorted_keys = sorted(heatmap_cooccurrence_data.items(), key=lambda item: item[1], reverse=True)
In [79]:
heatmap_cooccurrence_data = {}

for entry in sorted_keys[:100]:
    heatmap_cooccurrence_data[(entry[0][0], entry[0][1])] = entry[1]
In [80]:
# creating a heatmap
import seaborn as sns
import pandas as pd


# Create a matrix
unique_words = set(word for pair in heatmap_cooccurrence_data for word in pair)
unique_words_list = list(unique_words)  # Convert set to list
cooccurrence_matrix = pd.DataFrame(0, index=unique_words_list, columns=unique_words_list)

for (word1, word2), freq in heatmap_cooccurrence_data.items():
    cooccurrence_matrix.at[word1, word2] = freq
    #cooccurrence_matrix.at[word2, word1] = freq

mask = np.triu(np.ones_like(cooccurrence_matrix, dtype=bool))

# Create the heatmap
plt.figure(figsize=(30, 20))

sns.heatmap(cooccurrence_matrix,mask=mask, cmap='YlGnBu', 
            linewidths=0.1, linecolor='gray')
plt.title('Top 100 Co-occurrence Heatmap')
Out[80]:
Text(0.5, 1.0, 'Top 100 Co-occurrence Heatmap')
No description has been provided for this image

Building a Sentiment Interaction Network?¶

In [23]:
atl_tags = ["Atlanta", "beltine"]
chi_tags = ["Chicago","chibike", "606", "bloomingdale trail"]
det_tags = ["Detroit", "bikedetroit","Joe Louis Greenway", "dequindre cut"]
nyc_tags = ["NYC", "nycbike", "micromobilitynyc","highline"]


# get thread_ids for each city by tags
atl_thread_ids = []
chi_thread_ids = []
det_thread_ids = []
nyc_thread_ids = []

for index, entry in meta_df.iterrows():
    if any(tag in entry["tag"] for tag in atl_tags):
        atl_thread_ids.append(entry["id"])
    elif any(tag in entry["tag"] for tag in chi_tags):
        chi_thread_ids.append(entry["id"])
    elif any(tag in entry["tag"] for tag in det_tags):
        det_thread_ids.append(entry["id"])
    elif any(tag in entry["tag"] for tag in nyc_tags):
        nyc_thread_ids.append(entry["id"])
    
    # Extract the subreddit from the permalink
    subreddit_matches = re.findall(r'/r/([^/]+)', entry["permalink"])
    if not subreddit_matches:
        continue  # Skip this row if no subreddit is found
    subreddit = subreddit_matches[0]

    if subreddit in atl_tags:
        atl_thread_ids.append(entry["id"])
    elif subreddit in chi_tags:
        chi_thread_ids.append(entry["id"])
    elif subreddit in det_tags:
        det_thread_ids.append(entry["id"])
    elif subreddit in nyc_tags:
        nyc_thread_ids.append(entry["id"])

# build a dataframe of comments for each city
atl_comments_df = select_comments_df[select_comments_df["thread_id"].isin(atl_thread_ids)].copy()
chi_comments_df = select_comments_df[select_comments_df["thread_id"].isin(chi_thread_ids)].copy()
det_comments_df = select_comments_df[select_comments_df["thread_id"].isin(det_thread_ids)].copy()
nyc_comments_df = select_comments_df[select_comments_df["thread_id"].isin(nyc_thread_ids)].copy()


atl_comments_df["city_group"] = "Atlanta"
chi_comments_df["city_group"] = "Chicago"
det_comments_df["city_group"] = "Detroit"
nyc_comments_df["city_group"] = "NYC"

city_df = pd.concat([atl_comments_df, chi_comments_df, det_comments_df, nyc_comments_df])
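The same thread-to-city assignment can be driven by a single tag-to-city mapping, which keeps the matching logic in one place. A sketch, not run here:

In [ ]:
# Sketch: one mapping instead of four parallel if/elif chains
city_tag_map = {"Atlanta": atl_tags, "Chicago": chi_tags, "Detroit": det_tags, "NYC": nyc_tags}

def city_for_entry(entry):
    # match on the collected tag first, then fall back to the subreddit in the permalink
    for city, tags in city_tag_map.items():
        if any(tag in entry["tag"] for tag in tags):
            return city
    match = re.findall(r'/r/([^/]+)', entry["permalink"])
    if match:
        for city, tags in city_tag_map.items():
            if match[0] in tags:
                return city
    return None

thread_city = {entry["id"]: city_for_entry(entry) for _, entry in meta_df.iterrows()}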
In [24]:
plt.figure(figsize=(20, 12))
# note: ideally line width would scale with the comment count per city (not implemented here)
sns.lineplot(data=city_df, x="year", y="bert_sentiment", hue="city_group")

plt.title("BERT Sentiment by Year by City")
plt.show()
No description has been provided for this image
In [25]:
plt.figure(figsize=(20, 12))
# note: ideally line width would scale with the comment count per city (not implemented here)
sns.lineplot(data=city_df, x="year", y="sentiment", hue="city_group")

plt.title("VADER Sentiment by Year by City")
plt.show()
No description has been provided for this image

Build a word cloud¶

In [70]:
# build a word cloud for each city

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Create a word cloud for each city (see the sketch after this cell for a positive/negative split)


for city in city_df['city_group'].unique():
    # Combine all comments for the city into a single string
    text = ' '.join(city_df[city_df['city_group'] == city]['body'])
    
    # Create and generate a word cloud image
    wordcloud = WordCloud(max_font_size=100, max_words=100, background_color="white").generate(text)
    
    # Display the generated image
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f'Word Cloud for {city}')
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
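To get the positive/negative split mentioned above, the same loop can be run twice per city on sentiment-filtered comments. A sketch using the VADER sentiment column:

In [ ]:
# Sketch: separate positive and negative word clouds per city, split on VADER compound sentiment
for city in city_df['city_group'].unique():
    sub = city_df[city_df['city_group'] == city]
    for label, mask in [("Positive", sub["sentiment"] > 0), ("Negative", sub["sentiment"] < 0)]:
        text = ' '.join(sub.loc[mask, 'body'])
        if not text:
            continue
        wordcloud = WordCloud(max_font_size=100, max_words=100, background_color="white").generate(text)
        plt.figure(figsize=(10, 8))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title(f'{label} Word Cloud for {city}')
        plt.show()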

Building topic clustering for each city¶

In [78]:
def get_topic_clusters(text, topic_count):
    # build a fresh vectorizer so the function does not depend on the global one defined earlier
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')

    dtm = vectorizer.fit_transform(text)  # Document-term matrix

    # Applying LDA

    n_topics = topic_count
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(dtm)

    display_topics(lda, vectorizer.get_feature_names_out(), 20)
In [79]:
get_topic_clusters(atl_comments_df["body"], 4)
Topic 1:
park just like deleted beltline going yeah traffic years area way piedmont bus route line station north really sure right
Topic 2:
https com bike beltline ve like atlanta www just trail creek time bridge day street org reddit scooters right great
Topic 3:
beltline rail transit trail city marta just light way streetcar like project land use tax going new atlanta housing construction
Topic 4:
people don just city like atlanta live want think beltline crime going make know need areas really say parking actually


In [80]:
get_topic_clusters(det_comments_df["body"], 4)
Topic 1:
https detroit com www park deleted joe greenway org trail louis reddit belle riverfront michigan great http isle way building
Topic 2:
just like think city people really don detroit going better new make lot way right ve area years ll nice
Topic 3:
detroit city people like cut don transit downtown good dequindre rail area thank know freeway live time just years midtown
Topic 4:
bike people like just tax don want ride parking time thanks yes going day detroit did lanes know traffic road


In [81]:
get_topic_clusters(chi_comments_df["body"], 4)
Topic 1:
bike just people like 606 ve ride time trail good pretty right path bikes got riding traffic going way yeah
Topic 2:
people don https chicago com just like www car think cars know doesn need bike org want does reddit way
Topic 3:
city deleted like bike just line going really right way think better long infrastructure make street chicago bridge river tracks
Topic 4:
park trail live people north new area city chicago like just don neighborhood removed housing want 606 property really lot


In [82]:
get_topic_clusters(nyc_comments_df["body"], 4)
Topic 1:
deleted like bike just city park don bronx time people new museum thanks ride great want island area ve car
Topic 2:
just like transit high line school people way bus public right walk cool live going station yeah better seattle don
Topic 3:
just time new really highline know great park brooklyn city east lot manhattan don nyc check good ll place bridge
Topic 4:
com http www people like https don just think reddit park line new high thank going org look doing nyc


In [115]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import numpy as np

# Set the number of topics
n_topics = 4

# Create a CountVectorizer instance
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Group comments by year

grouped = atl_comments_df.groupby('year')

# Initialize a dictionary to store the prevalence of each topic over time
topic_prevalence = {i: [] for i in range(n_topics)}

# For each time period.
for time_period, group in grouped:
    # Perform LDA on the comments from this time period
    dtm = vectorizer.fit_transform(group['body'])
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(dtm)
    print(time_period)
    display_topics(lda, vectorizer.get_feature_names_out(), 20)
    

    # For each topic...
    for topic in range(n_topics):
        # Calculate the prevalence of this topic in this time period
        prevalence = np.sum(lda.transform(dtm)[:, topic])

        # Store the prevalence
        topic_prevalence[topic].append(prevalence)

# Plot the prevalence of each topic over time
for topic in range(n_topics):
    plt.plot([group[0] for group in grouped], topic_prevalence[topic], label=f'Topic {topic}')
plt.legend()
plt.show()
2013
Topic 1:
greenway roswell 10 linked pssh really come com http www path people pretty hall kell don gsu garage parking
Topic 2:
path pretty www com http really people roswell come don hall kell pssh greenway linked gsu 10 garage parking
Topic 3:
parking garage people come don hall kell really roswell path pretty com www http pssh greenway gsu linked 10
Topic 4:
kell hall gsu don really people pretty garage parking path roswell come com www http pssh greenway linked 10


2014
Topic 1:
city mural just commercial people council application ordinance saying section make does artist area wall message 46 going artists walls
Topic 2:
beltline trail comet path don silver like connected just use road miles people speech need commercial bike think park good
Topic 3:
bike beltline lane deleted lanes ride people ve trail pedestrians right atlanta traffic left going bikers speed know slow roads
Topic 4:
like art beltline just new living path trail law city way bankhead speech development need maybe yard atlanta area yeah


2015
Topic 1:
https humans youtu segway 4z61pisui3a free hands 200 good picture like don facebook www pretty tomorrow world beltline pic com
Topic 2:
people like atlanta time just need comments beltline guy new yes 400 want puppy trail picture park dogs tomorrow world
Topic 3:
http right trail www new org crossings street sure nice comet smyrna silver pretty area think com know jpg humans
Topic 4:
rides puppies don know ve com thing let really got phones omg looks pic jpg just like peachtree think group


2016
Topic 1:
link park east www com https drive greenway property just connect open trail atlanta west proctor creek http use street
Topic 2:
www property just com connect https east park open atlanta trail drive like http use complete street conversion proctor creek
Topic 3:
greenway creek peachtree trail city beltline projects connect drive like proctor http https com street complete conversion use west open
Topic 4:
atlanta city property park neighborhood www chattahoochee streets http add trails just com open west use street conversion complete proctor


2017
Topic 1:
com https www http reddit people going np trail actually comments plan atlanta beltline service org gatekeeping infrastructure right city
Topic 2:
just service trail atlanta said beltline emotional animals animal support ll city park ride like near road family time deal
Topic 3:
deleted people attention yeah good person thanks like bird raven don got working urbanism nice ll probably ve thing way
Topic 4:
people just city like really greenway know need creek service bike animal make want actually don roads doesn lot area


2018
Topic 1:
city deleted atlanta year years going uga just championship better georgia make way new gs like think need national really
Topic 2:
creek https park won years good greenway know just like com really atlanta schools lot georgia org trail bad people
Topic 3:
bike trail like area beltline time prices new ve people instead sure built thanks commute living transit neighborhoods sandy make
Topic 4:
beltline rail light atlanta trail transit com people just streetcar https marta city reddit right way housing like message think


2019
Topic 1:
deleted like beltline people assault just know rifle think really time gun don saw trail better lights make scooters right
Topic 2:
scooters don people scooter https atlanta beltline like com going think path bird really years city www car just mile
Topic 3:
people like work kroger just atlanta murder good ve bikes make don beltline scooter scooters city way know com yeah
Topic 4:
city beltline just stop atlanta https transit streetcar way right peachtree traffic scooters com marta rail park red seen creek


2020
Topic 1:
like people atlanta time com reddit new just https thanks don know crime different message beltline place going getting 10
Topic 2:
car just beltline security park management like got broken complex edge residents resident unlocked doors lot new right train definitely
Topic 3:
like people deleted don think ve just beltline work way sure good city need know right time going rail cars
Topic 4:
just people trail city run don big like park day atlanta com marta live beltline good order past area time


2021
Topic 1:
marta removed people bus transit atlanta just really service don isn ridership want know crime like coverage need better time
Topic 2:
beltline trail rail just https construction park path bike light way project org years bridge work southside atlanta right lot
Topic 3:
like just trail going beltline people park don ride way bike actually ve westside road sidewalk time city pretty lot
Topic 4:
like route just map ridership routes deleted bus going make good west think midtown doesn transfer downtown beltline yeah way


2022
Topic 1:
people city like just rail beltline don atlanta going live area transit think years marta want housing right make areas
Topic 2:
beltline park bike just trail like atlanta right going way path pretty bridge section piedmont street parking need north rail
Topic 3:
people city make just new like housing storage good food development land want years place better got line ve price
Topic 4:
just crime like trail beltline people don https deleted really time think com city lot good doesn storage atlanta going


2023
Topic 1:
atlanta like people city deleted comments don post years neighbors housing going say yes really future think just want order
Topic 2:
beltline transit rail people just like trail city don streetcar park line want light going way car use bike live
Topic 3:
beltline just like parking data park trail center https city atlanta area don people com space built think way traffic
Topic 4:
tax housing people city affordable beltline don atlanta just units developers new going apartments make development want build way need


No description has been provided for this image
In [144]:
from gensim.corpora import Dictionary

# Prepare texts
#texts = [group['body'].str.split().tolist() for _, group in atl_comments_df.groupby('year')]
texts = [sum(group['body'].str.split().tolist(), []) for _, group in atl_comments_df.groupby('year')]
# Create a dictionary representation of the documents
dictionary = Dictionary(texts)

# Filter out words that occur in fewer than 2 documents, or in more than 50% of the documents
dictionary.filter_extremes(no_below=2, no_above=0.5)

# Convert documents to a vectorized form by computing frequency of each word
corpus = [dictionary.doc2bow(text) for text in texts]

from gensim.models.wrappers import DtmModel

# Path to the dtm binary, downloaded and built from https://github.com/magsilva/dtm
dtm_path = "D:\\dtm-master\\dtm-master\\bin\\dtm-win64.exe"

# Create a DTM model (a single time slice here, since the texts were concatenated per year)
model = DtmModel(dtm_path, corpus, [len(texts)], num_topics=n_topics,
                 id2word=dictionary, initialize_lda=True)

# Get the topics
In [ ]:
topics = model.show_topic(topicid=2, time=0, topn=10)
topics
In [221]:
from gensim.corpora import Dictionary
from gensim.models.wrappers import DtmModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
nltk.download('stopwords')

# Sort DataFrame by year
atl_comments_df = atl_comments_df.sort_values('year')

# Group DataFrame by year
grouped = atl_comments_df.groupby('year')

# Initialize a list to hold the corpus and a list to hold the timeslices
corpus = []
timeslices = []

# Initialize a dictionary
dictionary = Dictionary()

# For each year, create a BoW representation and add it to the corpus
for year, group in grouped:
    # Preprocess the text
    # Define the stop words
    stop_words = set(stopwords.words('english'))
    # Add custom stop words
    custom_stop_words = ["deleted", "also", "one", "would"]  # replace with your custom stop words
    stop_words.update(custom_stop_words)

    # Preprocess the text and remove stop words
    texts = group['body'].apply(lambda text: [word for word in simple_preprocess(text) if word not in stop_words])
    
    # Update the dictionary
    dictionary.add_documents(texts)
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    
    # Create the BoW representation
    bow_texts = [dictionary.doc2bow(text) for text in texts]
    
    # Add the BoW representation to the corpus
    corpus.extend(bow_texts)
    
    # Add the number of documents to the timeslices
    timeslices.append(len(bow_texts))
    print("done")

# Now you can create your DTM model
dtm_path =  "D:\\dtm-master\\dtm-master\\bin\\dtm-win64.exe" # replace with the path to your DTM binary
model = DtmModel(dtm_path, corpus, timeslices, num_topics=4, id2word=dictionary, initialize_lda=True)
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mohsin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
done
done
done
done
done
done
done
done
done
done
done
In [236]:
model.show_topic(topicid=3, time=10, topn=20)
Out[236]:
[(0.018267252943986005, 'housing'),
 (0.017709494001744017, 'city'),
 (0.015103922009484056, 'tax'),
 (0.01371325480889903, 'transit'),
 (0.012715757681092147, 'beltline'),
 (0.012309494700629888, 'people'),
 (0.011857247608576453, 'development'),
 (0.011223392819190128, 'build'),
 (0.010649870458667035, 'affordable'),
 (0.010206822826657719, 'streetcar'),
 (0.009922368676925638, 'built'),
 (0.009469124755784243, 'crime'),
 (0.00915254799747858, 'land'),
 (0.00898241025141571, 'building'),
 (0.00894881463328227, 'atlanta'),
 (0.008723687315576285, 'property'),
 (0.008684034132630157, 'new'),
 (0.00834277220684594, 'need'),
 (0.008173405074660627, 'areas'),
 (0.007989686430305908, 'marta')]
In [228]:
timeslices
Out[228]:
[17, 105, 42, 10, 295, 366, 1033, 678, 961, 1944, 1271]
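Pairing each timeslice count with its year makes the list easier to read (the Atlanta comments span 2013-2023, so there are 11 slices):

In [ ]:
# label the per-year document counts that were fed to the DTM model
dict(zip(sorted(atl_comments_df['year'].unique()), timeslices))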
In [495]:
meta_comments_df['year'].unique().max() + 1
Out[495]:
2024
In [519]:
num_topics = 4

def plot_topic_proportions(df):
    df = df.sort_values('year')

    # Group DataFrame by year
    grouped = df.groupby('year')
        
    # Initialize a list to hold the corpus and a list to hold the timeslices
    corpus = []
    timeslices = []

    # Initialize a dictionary
    dictionary = Dictionary()

    # For each year, create a BoW representation and add it to the corpus
    for year, group in grouped:
        # Preprocess the text
        # Define the stop words
        stop_words = set(stopwords.words('english'))
        # Add custom stop words
        custom_stop_words = ["deleted", "also", "one", "would"]  # replace with your custom stop words
        stop_words.update(custom_stop_words)

        # Preprocess the text and remove stop words
        texts = group['body'].apply(lambda text: [word for word in simple_preprocess(text) if word not in stop_words])
        
        # Update the dictionary
        dictionary.add_documents(texts)
        dictionary.filter_extremes(no_below=20, no_above=0.5)
        
        # Create the BoW representation
        bow_texts = [dictionary.doc2bow(text) for text in texts]
        
        # Add the BoW representation to the corpus
        corpus.extend(bow_texts)
        
        # Add the number of documents to the timeslices
        timeslices.append(len(bow_texts))


    # Now you can create your DTM model
    dtm_path =  "D:\\dtm-master\\dtm-master\\bin\\dtm-win64.exe" # replace with the path to your DTM binary
    model = DtmModel(dtm_path, corpus, timeslices, num_topics=4, id2word=dictionary, initialize_lda=True)
    
    
    # Get the topic proportions for all documents
    topic_proportions = model.gamma_ / np.sum(model.gamma_, axis=1, keepdims=True)

    # Initialize a list to hold the topic sizes for each year
    topic_sizes_per_year = []

    # Initialize a counter for the current document
    doc_counter = 0

    # For each year, sum up the topic proportions
    for year, num_docs in enumerate(timeslices):
        # Initialize a list to hold the topic sizes for this year
        topic_sizes = [0] * num_topics

        # Sum up the topic proportions for this year
        for _ in range(num_docs):
            doc_topics = topic_proportions[doc_counter]
            for topic, proportion in enumerate(doc_topics):
                topic_sizes[topic] += proportion
            doc_counter += 1

        # Add the topic sizes for this year to the list
        topic_sizes_per_year.append(topic_sizes)


    # Convert the topic sizes to proportions
    topic_proportions_per_year = topic_sizes_per_year / np.sum(topic_sizes_per_year, axis=1, keepdims=True)
    plt.figure(figsize=(15, 10))
    # Create a line plot for each topic
    for i in range(num_topics):
        plt.plot(range(len(topic_proportions_per_year)), topic_proportions_per_year[:, i], label='Topic {}'.format(i))

    

    years = list(range(df['year'].unique().min(), 2024))  # replace with your actual years

    # Set x-ticks
    plt.xticks(range(len(years)), years)
    # Add labels and a legend
    plt.xlabel('Year')
    plt.ylabel('Proportion')
    plt.legend()
    plt.title("Topic Proportions Over Time for 'Rails to Trails' in Detroit Subreddits")

    # Show the plot
    plt.show()

    return model
In [520]:
model = plot_topic_proportions(det_comments_df)
No description has been provided for this image
In [584]:
df = meta_comments_df

df = df.sort_values('year')

# Group DataFrame by year
grouped = df.groupby('year')
    
# Initialize a list to hold the corpus and a list to hold the timeslices
corpus = []
timeslices = []

# Initialize a dictionary
dictionary = Dictionary()

# For each year, create a BoW representation and add it to the corpus
for year, group in grouped:
    # Preprocess the text
    # Define the stop words
    stop_words = set(stopwords.words('english'))
    # Add custom stop words
    custom_stop_words = ["deleted", "also", "one", "would"]  # replace with your custom stop words
    stop_words.update(custom_stop_words)

    # Preprocess the text and remove stop words
    texts = group['body'].apply(lambda text: [word for word in simple_preprocess(text) if word not in stop_words])
    
    # Update the dictionary
    dictionary.add_documents(texts)
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    
    # Create the BoW representation
    bow_texts = [dictionary.doc2bow(text) for text in texts]
    
    # Add the BoW representation to the corpus
    corpus.extend(bow_texts)
    
    # Add the number of documents to the timeslices
    timeslices.append(len(bow_texts))


# Now you can create your DTM model
dtm_path = "D:\\dtm-master\\dtm-master\\bin\\dtm-win64.exe"  # replace with the path to your DTM binary
#model = DtmModel(dtm_path, corpus, timeslices, num_topics=4, id2word=dictionary, initialize_lda=True)

# reuse meta_model (a DTM model trained on the full corpus in an earlier, unshown cell) rather than retraining
model = meta_model

# Get the topic proportions for all documents
topic_proportions = model.gamma_ / np.sum(model.gamma_, axis=1, keepdims=True)

# Initialize a list to hold the topic sizes for each year
topic_sizes_per_year = []

# Initialize a counter for the current document
doc_counter = 0

# For each year, sum up the topic proportions
for year, num_docs in enumerate(timeslices):
    # Initialize a list to hold the topic sizes for this year
    topic_sizes = [0] * num_topics

    # Sum up the topic proportions for this year
    for _ in range(num_docs):
        doc_topics = topic_proportions[doc_counter]
        for topic, proportion in enumerate(doc_topics):
            topic_sizes[topic] += proportion
        doc_counter += 1

    # Add the topic sizes for this year to the list
    topic_sizes_per_year.append(topic_sizes)


# Convert the topic sizes to proportions
topic_proportions_per_year = topic_sizes_per_year / np.sum(topic_sizes_per_year, axis=1, keepdims=True)
plt.figure(figsize=(15, 10))
# Create a line plot for each topic
for i in range(num_topics):
    plt.plot(range(len(topic_proportions_per_year)), topic_proportions_per_year[:, i], label='Topic {}'.format(i))



years = list(range(df['year'].unique().min(), 2024))  # replace with your actual years

# Set x-ticks
plt.xticks(range(len(years)), years)
# Add labels and a legend
plt.xlabel('Year')
plt.ylabel('Proportion')
plt.legend()
plt.title("Topic Proportions Over Time for 'Rails to Trails' in NYC Subreddits")

# Show the plot
plt.show()
No description has been provided for this image
In [591]:
plt.figure(figsize=(15, 10))
# Create a line plot for each topic

plt.plot(range(len(topic_proportions_per_year)), topic_proportions_per_year[:, 0], label='Information Posting')

plt.plot(range(len(topic_proportions_per_year)), topic_proportions_per_year[:, 3], label='Multimodal Transit')


plt.plot(range(len(topic_proportions_per_year)), topic_proportions_per_year[:, 2], label='Economic Development/Infrastructure')



plt.plot(range(len(topic_proportions_per_year)), topic_proportions_per_year[:, 1], label="Placemaking")





years = list(range(df['year'].unique().min(), 2024))  # replace with your actual years

# change the plot style
plt.style.use('fivethirtyeight')

# Set x-ticks
plt.xticks(range(len(years)), years)
# Add labels and a legend
plt.xlabel('Year')
plt.ylabel('Proportion')
plt.legend(loc='upper left')
plt.title("Topic Proportions Over Time for 'Rails to Trails' for entire corpus")

# Show the plot
plt.show()
No description has been provided for this image
In [590]:
model.show_topic(topicid=3
                 , time=9, topn=20)
Out[590]:
[(0.04429769524558069, 'city'),
 (0.04202375573002787, 'beltline'),
 (0.023344593344082742, 'atlanta'),
 (0.020655153091442304, 'new'),
 (0.02028979327523703, 'rail'),
 (0.01935652307739147, 'transit'),
 (0.01613978784394198, 'line'),
 (0.012177213665260398, 'trail'),
 (0.009712531931528, 'use'),
 (0.009030917662413469, 'high'),
 (0.008874117811358152, 'like'),
 (0.008768733525692005, 'light'),
 (0.008459559136150657, 'could'),
 (0.008069534386038552, 'way'),
 (0.007660525180423653, 'marta'),
 (0.007603056218386421, 'area'),
 (0.007212756881948139, 'parking'),
 (0.007136734079466126, 'much'),
 (0.006844744611030045, 'already'),
 (0.006582797540984433, 'along')]
In [276]:
def get_best_model(df):
    # Fit the dynamic topic model ten times (runs differ only in initialization)
    # and plot each run's topic proportions so a stable, interpretable run can be chosen
    models = []
    for i in range(10):
        print(f"Model {i}")
        models.append(plot_topic_proportions(df))
    return models
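Rather than eyeballing ten plots, the candidate runs could also be ranked by topic coherence. A hedged sketch, assuming `corpus`, `dictionary`, and `num_topics` from the DTM setup and gensim's `CoherenceModel`; the helper `rank_models_by_coherence` is hypothetical and was not part of the original analysis:

In [ ]:
from gensim.models import CoherenceModel

def rank_models_by_coherence(models, time=9, topn=20):
    # Score each candidate run by u_mass coherence of its topics at one time slice
    scores = []
    for m in models:
        topics = [[word for _, word in m.show_topic(topicid=k, time=time, topn=topn)]
                  for k in range(num_topics)]
        cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
        scores.append(cm.get_coherence())
    # Higher (less negative) u_mass is better
    return sorted(range(len(models)), key=lambda i: scores[i], reverse=True), scores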
In [299]:
atl_models = get_best_model(atl_comments_df)
Model 0 through Model 9  [a topic-proportion plot is displayed after each model run]
In [300]:
chi_models = get_best_model(chi_comments_df)
Model 0 through Model 9  [a topic-proportion plot is displayed after each model run]
In [301]:
nyc_models = get_best_model(nyc_comments_df)
Model 0 through Model 9  [a topic-proportion plot is displayed after each model run]
In [302]:
det_models = get_best_model(det_comments_df)
Model 0 through Model 9  [a topic-proportion plot is displayed after each model run]
In [293]:
test1[0].show_topic(topicid=3, time=10, topn=20)
Out[293]:
[(0.050884287828172774, 'people'),
 (0.028741898098231047, 'get'),
 (0.02839650861818371, 'like'),
 (0.01802276613760102, 'beltline'),
 (0.01799808257173386, 'car'),
 (0.01672860290056774, 'bike'),
 (0.016640745086420824, 'think'),
 (0.015368317015978869, 'go'),
 (0.015074327364254618, 'even'),
 (0.01452465327313849, 'going'),
 (0.013851807943289041, 'traffic'),
 (0.01374793021862558, 'know'),
 (0.0133177172481232, 'city'),
 (0.013040407085140584, 'way'),
 (0.012919422877098964, 'see'),
 (0.012900611434012403, 'much'),
 (0.012150721695327542, 'time'),
 (0.012094802776459116, 'make'),
 (0.01199889201943913, 'really'),
 (0.011965432682179077, 'good')]
In [292]:
test1[0].show_topic(topicid=2, time=10, topn=20)
Out[292]:
[(0.030423282379145072, 'years'),
 (0.02292489209308346, 'like'),
 (0.014409694079112995, 'new'),
 (0.012533187186286978, 'time'),
 (0.011813046075489388, 'beltline'),
 (0.011594810081146722, 'going'),
 (0.011519443788173753, 'yeah'),
 (0.011422820790023379, 'area'),
 (0.011404265263676851, 'right'),
 (0.011194476026926529, 'still'),
 (0.010268628999872522, 'never'),
 (0.010063071390182192, 'last'),
 (0.009954921551542827, 'back'),
 (0.009947208983544101, 'construction'),
 (0.009896432409268912, 'get'),
 (0.009671937814033334, 'place'),
 (0.009285040063435045, 'great'),
 (0.009147854191757343, 'see'),
 (0.00911478223765521, 'two'),
 (0.008965890250879757, 'storage')]
In [294]:
test1[0].show_topic(topicid=1, time=10, topn=20)
Out[294]:
[(0.03282955453052184, 'city'),
 (0.026348448858668314, 'people'),
 (0.018785071198887837, 'housing'),
 (0.016793737724053518, 'atlanta'),
 (0.014002851673527593, 'beltline'),
 (0.013338788600194837, 'like'),
 (0.011919826269919125, 'tax'),
 (0.01118677558234561, 'want'),
 (0.010559051193868218, 'development'),
 (0.009964101682397478, 'land'),
 (0.009165361369734873, 'areas'),
 (0.009134852734340991, 'live'),
 (0.008786461042227165, 'affordable'),
 (0.008734457899167325, 'build'),
 (0.008628965831779509, 'going'),
 (0.008349469052726368, 'new'),
 (0.008087118999461549, 'even'),
 (0.00794636165022539, 'money'),
 (0.00787561894790095, 'need'),
 (0.0074408747654093, 'building')]
In [295]:
test1[0].show_topic(topicid=0, time=10, topn=20)
Out[295]:
[(0.062964814922689, 'beltline'),
 (0.0408094064728117, 'rail'),
 (0.03779431920941372, 'trail'),
 (0.027085708922457527, 'transit'),
 (0.021234676332164217, 'https'),
 (0.020328874549374928, 'park'),
 (0.018537740544802735, 'atlanta'),
 (0.01659373107568897, 'com'),
 (0.01583369471929736, 'light'),
 (0.014588373723010875, 'marta'),
 (0.013486216410385698, 'way'),
 (0.011525327563386593, 'line'),
 (0.011400827124292369, 'like'),
 (0.011254697540777186, 'path'),
 (0.010755748765592548, 'along'),
 (0.00996492073840293, 'route'),
 (0.009924143188402523, 'www'),
 (0.00916066045192279, 'streetcar'),
 (0.008983982929636342, 'project'),
 (0.00890679627928527, 'much')]
In [90]:
# generate word clouds for each city based on positive, negative, and neutral sentiment
for city in city_df['city_group'].unique():
    # Combine all comments for the city with positive sentiment into a single string
    text = ' '.join(city_df[(city_df['city_group'] == city) & (city_df["sentiment"] > 0)]['body'])
    # Create and generate a word cloud image
    wordcloud = WordCloud(max_font_size=100, max_words=100, background_color="white").generate(text)
    
    # Display the generated image
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f'Positive Word Cloud for {city}')
    plt.show()


    # Combine all comments for the city with negative sentiment into a single string
    text = ' '.join(city_df[(city_df['city_group'] == city) & (city_df["sentiment"] < 0)]['body'])
    # Create and generate a word cloud image
    wordcloud = WordCloud(max_font_size=100, max_words=100, background_color="white").generate(text)
    
    # Display the generated image
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f'Negative Word Cloud for {city}')
    plt.show()


    # Combine all comments for the city with neutral (zero) sentiment into a single string
    text = ' '.join(city_df[(city_df['city_group'] == city) & (city_df["sentiment"] == 0)]['body'])
    # Create and generate a word cloud image
    wordcloud = WordCloud(max_font_size=100, max_words=100, background_color="white").generate(text)
    
    # Display the generated image
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f'Neutral Word Cloud for {city}')
    plt.show()
[Word cloud figures: positive, negative, and neutral clouds for each city]

Semantic Interaction Network¶

In [429]:
test_df = meta_comments_df[meta_comments_df["year"] == 2012]
In [430]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt


# Create the graph
G = nx.Graph()

# Add nodes with sentiment as an attribute
for index, row in test_df.iterrows():
    G.add_node(row['comment_id'], sentiment=row['sentiment'])

# Add edges based on parent_id
for index, row in test_df.iterrows():
    if pd.notna(row['parent_id']) and row['parent_id'] in G:
        G.add_edge(row['comment_id'], row['parent_id'], arrows="to")

# Visualization
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G)  # positions for all nodes
nx.draw(G, pos, node_size=20, node_color=[data['sentiment'] for _, data in G.nodes(data=True)])
plt.show()

# Step 2: Analyze the Network
# Calculating basic network properties
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
average_degree = sum(dict(G.degree()).values()) / num_nodes

# Calculating degree centrality
degree_centrality = nx.degree_centrality(G)
top_n_central_nodes = sorted(degree_centrality, key=degree_centrality.get, reverse=True)[:10]

# Basic Network Metrics
network_info = {
    "Number of Nodes": num_nodes,
    "Number of Edges": num_edges,
    "Average Degree": average_degree,
    "Top 20 Central Nodes": top_n_central_nodes
}

network_info
Out[430]:
{'Number of Nodes': 688,
 'Number of Edges': 474,
 'Average Degree': 1.377906976744186,
 'Top 10 Central Nodes': ['c5x3gio',
  'c5vlcaw',
  'c5vm429',
  'c5x1or5',
  'c5x9c3a',
  'c3izicl',
  'c5x0y2y',
  'c5x5bho',
  'c5vc0ix',
  'c5vemsz']}
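One follow-up this graph supports is checking whether sentiment is correlated across reply edges, i.e. whether replies tend to echo the sentiment of the comment they answer. A minimal sketch, assuming scipy is available and the `sentiment` node attribute set above (not part of the original run):

In [ ]:
from scipy.stats import pearsonr

# Pair up the sentiment scores at the two ends of every reply edge
edge_sentiments = [(G.nodes[u]['sentiment'], G.nodes[v]['sentiment']) for u, v in G.edges()]
s1, s2 = zip(*edge_sentiments)
print(pearsonr(s1, s2))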
In [64]:
# List of subreddit names
subreddit_names = ['Atlanta', 'Detroit', 'fuckcars', 'urbanplanning', 'bicycling',
                   'bicycletouring', 'bikecommuting', 'urbandesign', 'yimby',
                   'chicago', 'nyc', 'Urbanism', 'chibike', 'NYCbike', 'BikeDetroit',
                   'parks', 'MicromobilityNYC', 'urban', 'left_urbanism']

# List of shapes
shapes = ['circle', 'ellipse', 'box', 'circle', 'database', 'diamond', 'dot', 'star', 'triangle', 'triangleDown', 'hexagon', 'square']

# Create a mapping from subreddit names to shapes
shape_mapping = {subreddit: shapes[i % len(shapes)] for i, subreddit in enumerate(subreddit_names)}

The semantic interaction network: final code¶

In [436]:
from pyvis.network import Network
import matplotlib.colors as mcolors

# Rescale comment scores to the 10-100 range so they can be used as node sizes
test_df = test_df.copy()
test_df.loc[:, 'score'] = (test_df['score'] - test_df['score'].min()) / (test_df['score'].max() - test_df['score'].min()) * 90 + 10


def sentiment_to_color(sentiment, colormap=plt.cm.RdBu):
    # Normalize the sentiment score to be between 0 and 1
    normalized = (sentiment + 1) / 2  # Assuming sentiment scores are between -1 and 1
    return mcolors.rgb2hex(colormap(normalized))

# Convert to Pyvis Network
net = Network(notebook=True, height="900px", width="100%")
net.from_nx(G)


# Apply color map to nodes
for node in net.nodes:
    node['color'] = sentiment_to_color(node['sentiment'])
    node['size'] = float(test_df.loc[test_df["comment_id"] == node['id'], "score"].iloc[0])
    # use shapes to distinguish subreddits
    #node['shape'] = shape_mapping.get(test_df[test_df["comment_id"] == node['id']]["subreddit_name"].values[0])


# configure the physics layout (repulsion solver)
options = """ 
{
  "physics": {
    "repulsion": {
      "centralGravity": 0,
      "springLength": 40
    },
    "minVelocity": 0.75,
    "solver": "repulsion",
    "timestep": 0.97
  }
} 
"""

net.set_options(options)

#net.show_buttons(filter_ =["physics"])
# Save and show the network
output_path = 'senti_net1.html'
net.show(output_path)
Warning: When  cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.
senti_net1.html
Out[436]:
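Reply edges are inherently directed (parent comment to reply), so arrowheads only render meaningfully if the graph is built as a DiGraph and pyvis is created with `directed=True`. A hedged sketch of that alternative, reusing `test_df`; the output file name `senti_net_directed.html` is illustrative and node colors/sizes are omitted for brevity:

In [ ]:
# Rebuild the reply structure as a directed graph so pyvis draws arrowheads
DG = nx.DiGraph()
for _, row in test_df.iterrows():
    DG.add_node(row['comment_id'], sentiment=row['sentiment'])
for _, row in test_df.iterrows():
    if pd.notna(row['parent_id']) and row['parent_id'] in DG:
        DG.add_edge(row['parent_id'], row['comment_id'])

net_dir = Network(notebook=True, directed=True, height="900px", width="100%")
net_dir.from_nx(DG)
net_dir.show('senti_net_directed.html')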
In [457]:
meta_comments_df['time'].max()
Out[457]:
'2023-11-17 12:06:22'

Get degree sentiment from Graph¶

In [108]:
def plot_degreeSentiment(G):
    sentiments_by_degree = {}
    for node in G.nodes():
        degree = G.degree(node)
        sentiment = G.nodes[node]['sentiment']  # Assuming sentiment is stored in each node
        if degree not in sentiments_by_degree:
            sentiments_by_degree[degree] = []
        sentiments_by_degree[degree].append(sentiment)

    # Calculate mean sentiment for each degree
    mean_sentiments_by_degree = {degree: sum(sentiments) / len(sentiments) for degree, sentiments in sentiments_by_degree.items()}

    # Print or process the mean sentiments by degree
    for degree, mean_sentiment in mean_sentiments_by_degree.items():
        print(f"Degree {degree}: Mean Sentiment = {mean_sentiment}")

    # Normalize mean sentiment values to a 0-1 range for colormap
    min_sentiment = min(mean_sentiments_by_degree.values())
    max_sentiment = max(mean_sentiments_by_degree.values())
    normalized_sentiments = {degree: (sentiment - min_sentiment) / (max_sentiment - min_sentiment)
                            for degree, sentiment in mean_sentiments_by_degree.items()}

    # Create a colormap (get_cmap was deprecated in Matplotlib 3.7, so use the colormap registry)
    cmap = plt.colormaps['viridis']

    # Get the degree distribution
    degree_sequence = sorted([d for n, d in G.degree()], reverse=True)
    degree_count = dict((x, degree_sequence.count(x)) for x in set(degree_sequence))
    deg, cnt = zip(*degree_count.items())

    # Map the mean sentiment to colors
    colors = [cmap(normalized_sentiments[d]) if d in normalized_sentiments else cmap(0) for d in deg]

    # Plot the histogram with colored bars
    plt.bar(deg, cnt, width=0.80, color=colors)

    plt.title("Degree Histogram Colored by Mean Sentiment")
    plt.ylabel("Count")
    plt.xlabel("Degree")
    plt.xticks([d for d in deg])
    plt.colorbar(plt.cm.ScalarMappable(cmap=cmap), ax=plt.gca(), label='Normalized Mean Sentiment')
    plt.show()


plot_degreeSentiment(G)
Degree 1: Mean Sentiment = 0.23357829457364335
Degree 2: Mean Sentiment = 0.12506257668711654
Degree 3: Mean Sentiment = 0.17986086956521735
Degree 0: Mean Sentiment = 0.25653947368421054
Degree 4: Mean Sentiment = 0.48352222222222224
Degree 5: Mean Sentiment = 0.2732
Degree 6: Mean Sentiment = -0.4329
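The histogram suggests a possible relationship between how connected a comment is and its sentiment; a quick way to quantify it is a rank correlation over all nodes. A minimal sketch, assuming scipy is available (not part of the original run):

In [ ]:
from scipy.stats import spearmanr

degrees = [G.degree(n) for n in G.nodes()]
sentiments = [G.nodes[n]['sentiment'] for n in G.nodes()]
print(spearmanr(degrees, sentiments))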
In [ ]:
atl_models[5].show_topic(topicid=2, time=9, topn=100)
# a word cloud for this topic is generated in the next cell
In [615]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
topicid = 0
data_list = atl_models[5].show_topic(topicid=topicid, time=9, topn=150)
data_dict = {word: float(prob) for prob, word in data_list}

wordcloud = WordCloud(width=900, height=450,
                      max_font_size=100, max_words=200, background_color="white", colormap="cividis").generate_from_frequencies(data_dict)



plt.figure(figsize=(20, 16))
plt.imshow(wordcloud)
plt.axis("off")
plt.title(f'Word Cloud for Topic {topicid}')
plt.show()

# avoid blurry wordclouds
# https://stackoverflow.com/questions/44661566/how-to-make-the-word-cloud-picture-clear
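Following the linked Stack Overflow suggestion, blurriness can be reduced by rendering the cloud at a larger pixel size and smoothing it on display. A minimal sketch reusing `data_dict` from the cell above (the width/height values are illustrative):

In [ ]:
wordcloud_hi = WordCloud(width=1800, height=900, max_font_size=100, max_words=200,
                         background_color="white", colormap="cividis").generate_from_frequencies(data_dict)

plt.figure(figsize=(20, 10))
plt.imshow(wordcloud_hi, interpolation='bilinear')  # bilinear smoothing avoids pixelation
plt.axis("off")
plt.show()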
In [601]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
topicid = 3
data_list = meta_model.show_topic(topicid=topicid, time=9, topn=150)
data_dict = {word: float(prob) for prob, word in data_list}

wordcloud = WordCloud(max_font_size=100, max_words=200, background_color="white", colormap="cividis").generate_from_frequencies(data_dict)



plt.figure(figsize=(20, 16))
plt.imshow(wordcloud)
plt.axis("off")
plt.title(f'Word Cloud for Topic {topicid}')
plt.show()
In [474]:
atl_models[5].show_topic(topicid=3, time=9, topn=100)
Out[474]:
[(0.05844119105740144, 'beltline'),
 (0.05466473569739699, 'trail'),
 (0.03604989343964853, 'https'),
 (0.03253518312243404, 'park'),
 (0.027235417154008876, 'com'),
 (0.022368441838793437, 'atlanta'),
 (0.01636261814418857, 'side'),
 (0.015391321472013799, 'www'),
 (0.01473342269511291, 'path'),
 (0.013233188863891888, 'bridge'),
 (0.012011439749579645, 'new'),
 (0.011717784793342617, 'section'),
 (0.011013992994848851, 'construction'),
 (0.010693106566128477, 'eastside'),
 (0.010413218048451856, 'along'),
 (0.009962736539912916, 'westside'),
 (0.00971516152284005, 'east'),
 (0.009557037299291658, 'already'),
 (0.009545588505566777, 'like'),
 (0.008838166731594021, 'going'),
 (0.008790884412933797, 'piedmont'),
 (0.008667952525596073, 'project'),
 (0.008034134682408548, 'southside'),
 (0.007908697701404743, 'part'),
 (0.007600298158174717, 'years'),
 (0.007593860115688191, 'area'),
 (0.007374843787818783, 'way'),
 (0.007253390654134072, 'trails'),
 (0.007020278600741335, 'first'),
 (0.0069576265012628365, 'go'),
 (0.0069218312768840684, 'west'),
 (0.006821199282351539, 'data'),
 (0.0068026056373482365, 'space'),
 (0.0067161960619006806, 'org'),
 (0.0065367329452445996, 'get'),
 (0.006268401142093348, 'center'),
 (0.006262680583657894, 'last'),
 (0.006200055561144853, 'development'),
 (0.0061973198397294125, 'done'),
 (0.00586090164181372, 'even'),
 (0.005749929391773604, 'see'),
 (0.005571206064573061, 'still'),
 (0.005463255223981868, 'south'),
 (0.005385380857357536, 'north'),
 (0.00525393552973079, 'end'),
 (0.005149425245851305, 'lot'),
 (0.004970197046122252, 'open'),
 (0.004969081968474673, 'near'),
 (0.004957755340361408, 'next'),
 (0.0049476800825993955, 'pretty'),
 (0.004926913019677279, 'street'),
 (0.004842760476856101, 'built'),
 (0.004759251641816348, 'rail'),
 (0.004599981379887841, 'much'),
 (0.004509979279954723, 'year'),
 (0.004452356022715671, 'right'),
 (0.0043536107645729735, 'bike'),
 (0.004058056813906511, 'time'),
 (0.004015392154665966, 'seems'),
 (0.003959994273215362, 'pedestrian'),
 (0.003866566532333371, 'use'),
 (0.0037764247284880204, 'access'),
 (0.003723863641784312, 'city'),
 (0.003610773423291737, 'around'),
 (0.003580016156242058, 'two'),
 (0.0035654171350392853, 'connect'),
 (0.003317041550234728, 'well'),
 (0.0032838158230090214, 'could'),
 (0.0032762825011768276, 'existing'),
 (0.003260829230206098, 'since'),
 (0.003186176562318973, 'little'),
 (0.003161591861288462, 'old'),
 (0.0031213708645623952, 'looks'),
 (0.0030445286584374494, 'past'),
 (0.003024325761496098, 'long'),
 (0.0029580415750344365, 'map'),
 (0.002884990664977463, 'qts'),
 (0.0028757884134196477, 'actually'),
 (0.0028699788776318964, 'think'),
 (0.0028530805470959463, 'know'),
 (0.0028413207038759446, 'ride'),
 (0.002833797112938042, 'ponce'),
 (0.002819075845511796, 'design'),
 (0.002804341683870535, 'corridor'),
 (0.0027894367092967617, 'maybe'),
 (0.00277320905800515, 'run'),
 (0.002752708286773827, 'expansion'),
 (0.0026624834255665844, 'station'),
 (0.002652178812675382, 'article'),
 (0.0026230897501906165, 'line'),
 (0.0026067772873040835, 'high'),
 (0.002604071863451952, 'good'),
 (0.00259774492997314, 'make'),
 (0.0025839483908401107, 'another'),
 (0.002559862476667654, 'property'),
 (0.002535255303208565, 'sections'),
 (0.0025081270232322757, 'segment'),
 (0.0024654959891910687, 'neighborhoods'),
 (0.0024544693059763097, 'though'),
 (0.0024406463124270568, 'look')]
In [423]:
meta_model = plot_topic_proportions(meta_comments_df)
In [428]:
meta_model.show_topic(topicid=3, time=10, topn=100)
Out[428]:
[(0.036084685580264245, 'beltline'),
 (0.030671172631195415, 'city'),
 (0.018432243797217144, 'atlanta'),
 (0.018302159031183207, 'transit'),
 (0.017209632038225647, 'rail'),
 (0.014672745754389887, 'new'),
 (0.013230840522066037, 'line'),
 (0.011846652754452274, 'trail'),
 (0.009325157300093935, 'use'),
 (0.008943949047062858, 'like'),
 (0.008240278224503494, 'way'),
 (0.008148340280944657, 'could'),
 (0.007623908926495465, 'light'),
 (0.0075991772969959985, 'marta'),
 (0.0074979748663711624, 'parking'),
 (0.007199175488195809, 'area'),
 (0.007006964079698056, 'much'),
 (0.006883093884155512, 'high'),
 (0.006647273994097553, 'already'),
 (0.006356099705110454, 'along'),
 (0.00633334668631757, 'park'),
 (0.0062481588503685405, 'right'),
 (0.006084179481688337, 'downtown'),
 (0.006010708230608893, 'built'),
 (0.006000909558582486, 'even'),
 (0.0057761627493714465, 'going'),
 (0.00570849589236683, 'public'),
 (0.005509890857208341, 'build'),
 (0.005483426842328759, 'project'),
 (0.0054459195023882625, 'part'),
 (0.005207712614150265, 'people'),
 (0.005188684296222436, 'get'),
 (0.005187940770882158, 'think'),
 (0.005174792197369418, 'streetcar'),
 (0.0050414607560305535, 'space'),
 (0.004998232116793009, 'still'),
 (0.004990244537631356, 'lot'),
 (0.004838303742843326, 'development'),
 (0.004592176685629839, 'bus'),
 (0.004221342448793771, 'around'),
 (0.004192476771209723, 'plan'),
 (0.004171385713392406, 'better'),
 (0.004076154664736977, 'areas'),
 (0.0040657661587404905, 'well'),
 (0.004001261218070515, 'street'),
 (0.003778298697025204, 'service'),
 (0.003768312171037916, 'train'),
 (0.0037581123240363356, 'station'),
 (0.00369571363961841, 'infrastructure'),
 (0.0036924668894547253, 'make'),
 (0.00358107547753386, 'traffic'),
 (0.0035622727501903747, 'route'),
 (0.0035198566086233713, 'need'),
 (0.003503327621821249, 'path'),
 (0.003462285586334362, 'actually'),
 (0.0033856270405981377, 'construction'),
 (0.0032809016626653784, 'go'),
 (0.003164999652929693, 'years'),
 (0.003164503422334512, 'really'),
 (0.003142006884986047, 'system'),
 (0.0030382446558800276, 'though'),
 (0.003009901685796345, 'work'),
 (0.00299415514713903, 'see'),
 (0.0029785457718285347, 'trails'),
 (0.002909015904759354, 'good'),
 (0.0028779856468406444, 'building'),
 (0.002834501624272892, 'take'),
 (0.002747953255409679, 'near'),
 (0.002697713973541614, 'transportation'),
 (0.002691794281060314, 'want'),
 (0.0026885573928124304, 'existing'),
 (0.0026709770235178315, 'center'),
 (0.0026318034392668684, 'access'),
 (0.0026267394064216415, 'many'),
 (0.002614780448282727, 'point'),
 (0.0025804699140035236, 'car'),
 (0.0025634715832629656, 'done'),
 (0.0025260507102596377, 'pedestrian'),
 (0.0024882403729890785, 'land'),
 (0.0024787016427181476, 'urban'),
 (0.0024288635093301565, 'used'),
 (0.0024049731917024334, 'loop'),
 (0.0023977019039638538, 'planning'),
 (0.002332427700051583, 'funding'),
 (0.0022823030391486746, 'tracks'),
 (0.002269337389764444, 'side'),
 (0.002248911732802077, 'freeway'),
 (0.0022285759228013575, 'design'),
 (0.0022035274016651034, 'housing'),
 (0.0021871418671312332, 'mile'),
 (0.0021842912012409736, 'money'),
 (0.0021841997602142383, 'time'),
 (0.002163107828364081, 'next'),
 (0.002146247635966125, 'long'),
 (0.002126008429043354, 'two'),
 (0.002054969008379992, 'first'),
 (0.0020431144230902056, 'needs'),
 (0.002023130879536212, 'far'),
 (0.0019886230505066762, 'walk'),
 (0.0019873299914373384, 'place')]
In [365]:
fltrd_users_df["comments_count"].describe()
Out[365]:
count    5427.000000
mean        5.661692
std        61.579629
min         2.000000
25%         2.000000
50%         3.000000
75%         5.000000
max      4484.000000
Name: comments_count, dtype: float64
In [447]:
indepth_df = []

# Collect every comment written by the three selected users and re-score its sentiment
focus_users = ("killroy200", "Miser", "ArchEast")

for idx, row in meta_comments_df.iterrows():
    user = row['author']
    text = row['body']

    if user in focus_users:
        sentiment = get_sentiment(text)
        row = row.tolist()
        row.append(sentiment)
        indepth_df.append(row)

indepth_df = pd.DataFrame(indepth_df)
indepth_df.columns = ["comment_id", "parent_id", "body", "author", "time", "score", "subreddit_id",
                      "thread_id", "year", "subreddit_name", "sentiment", "bert", "senti"]
In [448]:
model = plot_topic_proportions(indepth_df)
In [453]:
model.show_topic(topicid=3, time=9, topn=100)
Out[453]:
[(0.06175160639482629, 'people'),
 (0.060913594044230586, 'city'),
 (0.05640891265408597, 'even'),
 (0.051435203203808864, 'parking'),
 (0.04425251828076413, 'housing'),
 (0.03375099117093492, 'like'),
 (0.03189546355584927, 'park'),
 (0.031536780069013774, 'want'),
 (0.03059204913203206, 'need'),
 (0.02830493695410986, 'built'),
 (0.0281785445817948, 'still'),
 (0.02806067628172667, 'car'),
 (0.026302621571713588, 'beltline'),
 (0.025122912955520552, 'atlanta'),
 (0.024906046137978515, 'things'),
 (0.024532696158956008, 'development'),
 (0.022985687874016524, 'much'),
 (0.02278796637416926, 'new'),
 (0.02277483948901064, 'enough'),
 (0.022453984652214046, 'many'),
 (0.021842161786326336, 'use'),
 (0.02128324345814107, 'cars'),
 (0.019945834825396597, 'time'),
 (0.01985772751785559, 'already'),
 (0.01902171005045237, 'work'),
 (0.01892696717542726, 'around'),
 (0.016860360611486606, 'get'),
 (0.015960348399720728, 'well'),
 (0.015308696553985398, 'way'),
 (0.014014159240618062, 'going'),
 (0.0132271199280601, 'really'),
 (0.013130425389720318, 'corridor'),
 (0.011525902508047131, 'good'),
 (0.009088902274724132, 'right'),
 (0.00895999075813205, 'know'),
 (0.008662358883645287, 'better'),
 (0.008416419195893777, 'see'),
 (0.007821270454906408, 'take'),
 (0.007354298314495529, 'transit'),
 (0.007160442705003321, 'make'),
 (0.006434119304952279, 'yeah'),
 (0.006042942710892258, 'though'),
 (0.005531651991795271, 'pretty'),
 (0.004833418142823026, 'something'),
 (0.004044328585956175, 'go'),
 (0.0035562953999385316, 'could'),
 (0.0030890166733221155, 'bike'),
 (0.0028831160331583617, 'think'),
 (0.002761710245471077, 'https'),
 (0.0008767573357241311, 'trail'),
 (0.0008767573357241311, 'marta'),
 (0.0008767573357241311, 'route'),
 (0.0008767573357241311, 'rail')]

Other dataviz¶

[Three additional visualization images (no captions provided)]