# Provenance (Hugging Face file-viewer header captured together with the source):
#   file: app.py (23.9 kB), uploaded by gtech13
#   commit: d8c503d (verified) - "Update app.py"
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import streamlit as st
import pandas as pd
import warnings
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from sentence_transformers import SentenceTransformer, models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from umap import UMAP
from hdbscan import HDBSCAN
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
warnings.filterwarnings("ignore")
# ==========================================
# 1. PAGE CONFIGURATION & MAPPINGS
# ==========================================
st.set_page_config(
    page_title="Topic Modeling Pipeline",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Global CSS injected right after the page config: an always-present vertical
# scrollbar keeps the page width constant (no left/right jumping), and
# horizontal overflow of the main container is hidden.
_LAYOUT_CSS = """
<style>
/* Force the vertical scrollbar to always show so the page width never changes */
html { overflow-y: scroll; }
/* Prevent horizontal scrolling */
.block-container { max-width: 100%; overflow-x: hidden; }
</style>
"""
st.markdown(_LAYOUT_CSS, unsafe_allow_html=True)
# UI label -> model identifier passed to the embedding loader.
# Insertion order is the order the labels are offered in the selectbox.
EMBEDDING_MAP = {
    "MiniLM (Fast & Lightweight)": "sentence-transformers/all-MiniLM-L6-v2",
    "MPNet (High Accuracy)": "sentence-transformers/all-mpnet-base-v2",
    "Specter2 (Scientific/Academic)": "allenai/specter2_base",
}

# UI label -> pooling strategy keyword. The loader tests each keyword for the
# substrings "mean", "max" and "cls", so "mean-max" turns on two modes at once.
POOLING_MAP = {
    "Mean (Smooth context)": "mean",
    "Max (Sharp keywords)": "max",
    "CLS (Classification)": "cls",
    "Mean-Max (Combined)": "mean-max",
}
# --- CACHE THE NEURAL NETWORK ---
@st.cache_resource
def load_embedder(model_name, pool_strat):
    """Assemble the sentence-embedding pipeline for one model/pooling choice.

    The pipeline chains a transformer backbone, a pooling layer and a final
    ``models.Normalize()`` module. The function is wrapped in
    ``st.cache_resource``.

    Args:
        model_name: Model identifier handed to ``models.Transformer``.
        pool_strat: Pooling keyword. Each of the substrings "mean", "max" and
            "cls" switches on the matching pooling mode, so "mean-max"
            enables both mean and max pooling.

    Returns:
        A ``SentenceTransformer`` built from the three modules.
    """
    backbone = models.Transformer(model_name)

    # Substring tests (not equality) so a combined keyword selects several modes.
    use_mean = "mean" in pool_strat
    use_max = "max" in pool_strat
    use_cls = "cls" in pool_strat

    pooling = models.Pooling(
        backbone.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=use_mean,
        pooling_mode_max_tokens=use_max,
        pooling_mode_cls_token=use_cls,
    )

    # The Normalize module is always appended, regardless of the UI choices.
    pipeline = [backbone, pooling, models.Normalize()]
    return SentenceTransformer(modules=pipeline)
# ==========================================
# 2. THE GUIDED UI
# ==========================================
st.title("BERTopic - Topic Modeling Pipeline with Math Visualization")

# The pipeline diagram is optional decoration, so the app must keep working
# when the file is absent. Test for the file up front instead of catching
# FileNotFoundError around st.image.
# NOTE(review): recent Streamlit releases appear to report an unreadable image
# path with their own media-storage error type rather than FileNotFoundError,
# which the previous handler would not have caught - confirm against the
# deployed Streamlit version.
if os.path.isfile("pipeline.png"):
    st.image("pipeline.png", use_container_width=True)

st.divider()
st.header("📥 Step 0: Input Data & Core Settings")
st.info(
    "💡 **Preprocessing Note:** You do not need to manually lowercase or strip punctuation. "
    "The `CountVectorizer` algorithm and the `Uncased` BERT Neural Networks handle casing "
    "and token normalization autonomously at the mathematical level."
)

data_source = st.radio(
    "Choose Data Source:",
    ["Use Sample ACM Abstract", "Paste Text"],
    horizontal=True,
)

# Built-in demo text offered when the sample source is selected.
sample_abstract = """
Students who registered for the Mapping with Google massive open online course (MOOC)
were asked several questions during the registration process to identify prior
experience with eleven skills as well as their goals for registering for the course.
At the end of the course, we compared students' self reports of goal achievement
with behavioral click-stream analysis. In addition, we assessed how well prior
skill in a subject predicts a student's course completion and found no correlation.
Our research shows that students who completed course activities were more likely
to earn certificates of completion than peers who did not.
"""

# Pre-fill the text box only for the sample source; otherwise start empty.
if data_source == "Use Sample ACM Abstract":
    initial_text = sample_abstract
else:
    initial_text = ""
raw_data = st.text_area("Text Data:", value=initial_text, height=150)

# Two side-by-side sliders: number of themes and words shown per theme.
themes_col, words_col = st.columns(2)
with themes_col:
    n_themes = st.slider("Target Number of Themes", 2, 20, 3)
with words_col:
    words_per_theme = st.slider("Words to Output per Theme", 3, 10, 5)
# --- THE VERTICAL CONFIGURATION WIZARD ---
st.header("⚙️ Model Configuration")

# Layer 1: embedding model and pooling strategy (labels come from the two maps).
with st.expander("1️⃣ Semantic Layer (Embeddings & Pooling)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `all-MiniLM-L6-v2` with `Mean` pooling.*")
    ui_embedding = st.selectbox("Embedding Model (Override Default):", list(EMBEDDING_MAP))
    ui_pooling = st.selectbox("Pooling Strategy (Override Default):", list(POOLING_MAP))

# Layer 2: dimensionality reduction. The distance metric is only selectable
# for UMAP; the PCA choice pins it to "euclidean" and explains why.
with st.expander("2️⃣ Geometry Layer (Dimensionality Reduction)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `UMAP` with `Cosine` distance to reduce 384D to 5D space.*")
    ui_algo = st.selectbox("Algorithm", ["UMAP (Complex geometry)", "PCA (Fast/Deterministic)"])
    if "UMAP" not in ui_algo:
        ui_metric = "euclidean"
        st.info("PCA inherently uses Variance (Euclidean math), so distance metrics are bypassed.")
    else:
        ui_metric = st.selectbox("Distance Metric", ["cosine", "euclidean", "manhattan"])
# Layer 3: informational only - no widgets. Describes which clustering
# algorithm the pipeline picks based on the number of sentences.
with st.expander("3️⃣ Clustering Layer (Grouping)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `HDBSCAN` exclusively (which crashes on tiny datasets).*")
    # Raw string: the text contains the LaTeX command `\ge`, and `\g` is not a
    # valid escape sequence in a normal string literal (newer Python versions
    # warn about it). The rendered text is unchanged.
    st.markdown(r"""
*The model mathematically draws boundaries around similar sentences. *
* **Primary clustering algorithm (HDBSCAN):** Runs on datasets $\ge$ 15 sentences. Automatically filters outliers and finds dense semantic clouds. *(Defaults: min_cluster_size=10)*
* **Fallback clustering algorithm (K-Means):** Runs on datasets $<$ 15 sentences. Forces all sentences into buckets to prevent math crashes on tiny text samples.
""")
# Layer 4: CountVectorizer options - n-gram range and the max_df noise filter.
with st.expander("4️⃣ Vocabulary Layer (Vectorization)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** Uses `Unigrams` (1 word) and does **not** filter redundant dataset noise.*")
    ngram_range = st.slider(
        "N-Gram Range",
        1,
        3,
        (1, 2),
        help="1=Unigrams, 2=Bigrams (e.g., 'machine learning')",
    )
    auto_noise = st.checkbox(
        "Auto-Remove Redundant Noise (max_df)",
        value=True,
        help="Mathematically deletes words appearing in >85% of documents.",
    )

# Layer 5: optional representation model applied on top of c-TF-IDF.
with st.expander("5️⃣ Extraction Layer (Representation)", expanded=True):
    st.markdown("*💡 **BERTopic Default:** ALWAYS extracts baseline words using **c-TF-IDF** (Word Frequency).*")
    extraction_choices = [
        "None (Base c-TF-IDF only)",
        "KeyBERTInspired (Semantic cosine)",
        "MMR (Reduce redundancy)",
    ]
    ui_extraction = st.selectbox("Apply Advanced Filter on top of c-TF-IDF:", extraction_choices)
    # The diversity slider only exists for MMR; every other choice leaves it None.
    mmr_diversity = st.slider("MMR Diversity Penalty", 0.0, 1.0, 0.3) if "MMR" in ui_extraction else None
st.header("📊 Evaluation Metrics")

# Every available KPI is pre-selected; the user can deselect any of them.
kpi_choices = ["Topic Diversity", "NPMI Coherence", "UMass Coherence", "Silhouette Score"]
eval_metrics = st.multiselect(
    "Select KPIs to generate a final report card:",
    kpi_choices,
    default=list(kpi_choices),
)

st.divider()
# ==========================================
# 3. ENGINE EXECUTION
# ==========================================
# Everything below runs only on the rerun triggered by the button:
#   1. split the text into sentence documents and validate their count,
#   2. fit BERTopic (PCA + K-Means for small data or the PCA choice,
#      otherwise UMAP + HDBSCAN),
#   3. list the discovered themes,
#   4. compute and show the selected KPIs,
#   5. draw the six-panel explanatory matplotlib figure.
if st.button("🚀 Run Topic Modeling Pipeline", type="primary", use_container_width=True):
    if not raw_data or len(raw_data) < 20:
        st.error("Please provide more text data!")
        st.stop()

    # Documents are the period-delimited fragments longer than 10 characters.
    sentences = [s.strip() for s in raw_data.split('.') if len(s.strip()) > 10]
    dataset_size = len(sentences)

    # The small-data path reduces to 2 PCA components and clusters with
    # K-Means; neither can run on fewer than two documents, so stop with a
    # readable message instead of a stack trace.
    if dataset_size < 2:
        st.error("Please provide at least two sentences (each longer than 10 characters and separated by periods).")
        st.stop()

    with st.spinner("Processing Semantic Pipeline... (Models are cached to prevent crashes)"):
        academic_noise = ['students', 'course', 'research', 'paper', 'found', 'likely', 'did']
        from sklearn.feature_extraction import text
        stop_w = list(text.ENGLISH_STOP_WORDS.union(academic_noise))
        # max_df filtering is only enabled when requested and there are more than 10 documents.
        vectorizer_model = CountVectorizer(
            stop_words=stop_w,
            ngram_range=ngram_range,
            max_df=0.85 if auto_noise and dataset_size > 10 else 1.0,
        )

        custom_embedder = load_embedder(EMBEDDING_MAP[ui_embedding], POOLING_MAP[ui_pooling])
        embeddings = custom_embedder.encode(sentences)

        # Fewer than 15 sentences, or an explicit PCA choice, selects PCA + K-Means.
        small_dataset = dataset_size < 15
        is_fallback = small_dataset or "PCA" in ui_algo
        if is_fallback:
            # K-Means cannot be asked for more clusters than there are documents.
            safe_n_themes = min(n_themes, dataset_size)
            dim_model = PCA(n_components=2, random_state=42)
            cluster_model = KMeans(n_clusters=safe_n_themes, random_state=42)
            reduce_topics = None
            algo_used = "PCA"
            cluster_algo = "K-Means"
        else:
            dim_model = UMAP(n_neighbors=15, n_components=5, metric=ui_metric, random_state=42)
            # Must be bound to `cluster_model`: that is the name passed to
            # BERTopic below (it was previously bound to a different name,
            # which made this branch fail with a NameError).
            cluster_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')
            reduce_topics = n_themes
            algo_used = "UMAP"
            cluster_algo = "HDBSCAN"

        if "MMR" in ui_extraction:
            rep_model = MaximalMarginalRelevance(diversity=mmr_diversity, top_n_words=words_per_theme)
        elif "KeyBERT" in ui_extraction:
            rep_model = KeyBERTInspired(top_n_words=words_per_theme)
        else:
            rep_model = None

        topic_model = BERTopic(
            embedding_model=custom_embedder,
            umap_model=dim_model,
            hdbscan_model=cluster_model,
            vectorizer_model=vectorizer_model,
            representation_model=rep_model,
            nr_topics=reduce_topics,
            top_n_words=words_per_theme,
            language="english",
        )
        topics, _ = topic_model.fit_transform(sentences)

    # ==========================================
    # 4. UI DISPLAY & METRICS
    # ==========================================
    st.success("Analysis Complete!")
    if is_fallback:
        if safe_n_themes < n_themes:
            st.warning(f"⚠️ **Reduced requested themes from {n_themes} to {safe_n_themes}.**\n\n"
                       f"*Reason:* BERTopic clusters complete sentences to preserve context. "
                       f"You cannot sort {dataset_size} sentences into {n_themes} buckets without leaving empty buckets, "
                       f"which mathematically breaks the clustering algorithms!")
        elif small_dataset:
            st.info(f"ℹ️ Auto-switched to PCA/K-Means due to small dataset size ({dataset_size} sentences).")
        else:
            # The user picked PCA on a dataset that is not small; do not blame the data size.
            st.info("ℹ️ Using PCA/K-Means as selected in the Geometry Layer.")

    st.markdown("### 🏆 Discovered Themes")
    topic_info = topic_model.get_topic_info()
    all_words = []
    cols = st.columns(3)
    # Topic -1 is skipped; the remaining themes are laid out round-robin over three columns.
    shown_topics = [t_id for t_id in topic_info['Topic'] if t_id != -1]
    for col_idx, t_id in enumerate(shown_topics):
        theme_w = [w[0] for w in topic_model.get_topic(t_id)]
        all_words.append(theme_w)
        with cols[col_idx % 3]:
            st.info(f"**Theme {t_id + 1}**\n\n" + "\n".join([f"🔹 {w}" for w in theme_w]))

    # --- METRICS CALCULATIONS ---
    # Each status string is what gets displayed; "Skipped" means not requested or not computed.
    div_status = npmi_status = umass_status = sil_status = "Skipped"

    if eval_metrics:
        with st.spinner("Calculating mathematical metrics... (NPMI requires building a dictionary and takes a moment)"):
            # 1. Diversity: unique theme words divided by total theme words.
            if "Topic Diversity" in eval_metrics and all_words:
                unique_words = {w for theme in all_words for w in theme}
                total_words = sum(len(theme) for theme in all_words)
                div_val = len(unique_words) / total_words if total_words > 0 else 0.0
                div_status = f"{div_val:.2f}"

            # 2. Coherence Models (NPMI & UMass)
            wants_npmi = "NPMI Coherence" in eval_metrics
            wants_umass = "UMass Coherence" in eval_metrics
            if wants_npmi or wants_umass:
                coherence_skipped = "Skipped (Data too small)"
                # Best-effort: building the shared inputs may fail on tiny inputs.
                try:
                    analyzer = vectorizer_model.build_analyzer()
                    tokenized = [analyzer(s) for s in sentences]
                    dictionary = corpora.Dictionary(tokenized)
                except Exception:
                    tokenized = dictionary = None

                def coherence_label(measure):
                    """Return the display string for one gensim coherence measure.

                    Each measure is computed in its own try block so that a
                    failure of one does not overwrite the result of the other.
                    """
                    if dictionary is None:
                        return coherence_skipped
                    try:
                        score = CoherenceModel(
                            topics=all_words,
                            texts=tokenized,
                            dictionary=dictionary,
                            coherence=measure,
                        ).get_coherence()
                    except Exception:
                        return coherence_skipped
                    if np.isnan(score):
                        return "N/A (Too few words)"
                    return f"{float(score):.2f}"

                if wants_npmi:
                    npmi_status = coherence_label('c_npmi')
                if wants_umass:
                    umass_status = coherence_label('u_mass')

            # 3. Silhouette Score (cosine, on the full embeddings, topic -1 excluded)
            if "Silhouette Score" in eval_metrics:
                valid_idx = [i for i, t in enumerate(topics) if t != -1]
                valid_labels = [topics[i] for i in valid_idx]
                # Requires at least 2 labels and fewer labels than samples.
                if 1 < len(set(valid_labels)) < len(valid_idx):
                    sil_val = float(silhouette_score(
                        np.array([embeddings[i] for i in valid_idx]),
                        valid_labels,
                        metric='cosine'
                    ))
                    sil_status = f"{sil_val:.2f}"
                else:
                    sil_status = "Skipped (Themes need ≥2 sentences)"

        # --- RENDER KPI DASHBOARD WITH TOOLTIPS ---
        # Kept inside the `if eval_metrics` guard: the column count passed to
        # st.columns must be positive, so an empty selection renders no row.
        kpi_cards = {
            "Topic Diversity": (
                div_status,
                "Math: Unique Words / Total Words.\nTarget: 1.0 (No redundant words across themes).",
            ),
            "NPMI Coherence": (
                npmi_status,
                "Math: Normalized Pointwise Mutual Information.\nCalculates joint probability of words existing together.\nTarget: >0.1",
            ),
            "UMass Coherence": (
                umass_status,
                "Math: Internal log-conditional probability.\nEvaluates if words co-occur strictly inside your uploaded dataset.\nTarget: Closer to 0.",
            ),
            "Silhouette Score": (
                sil_status,
                "Math: (b - a) / max(a,b).\nMeasures intra-cluster density (a) vs nearest-cluster distance (b).\nTarget: >0.0",
            ),
        }
        st.markdown("### 📊 Key Performance Indicators (KPI)")
        for kpi_col, metric in zip(st.columns(len(eval_metrics)), eval_metrics):
            kpi_value, kpi_help = kpi_cards[metric]
            with kpi_col:
                st.metric(label=metric, value=kpi_value, help=kpi_help)

    # ==========================================
    # 5. XAI VISUALIZATION GRAPH (With Live Math & Matrices)
    # ==========================================
    st.markdown("### 📈 Explainable AI (XAI) Architecture Map")
    with st.spinner("Rendering Mathematical Dashboard..."):
        sns.set_theme(style="whitegrid")
        fig = plt.figure(figsize=(18, 16))

        # First word of each UI label is used in titles.
        pool_title = ui_pooling.split()[0]
        rep_title = ui_extraction.split()[0]
        fig.suptitle(f"Topic Modeling Mathematical Pipeline\n(Pooling: {pool_title} | Rep: {rep_title})", fontsize=20, fontweight='bold', y=0.98)

        # Style for the Math/Data boxes
        box_style = dict(boxstyle="round,pad=0.5", facecolor='#f8f9fa', edgecolor='#4b72b8', alpha=0.95, lw=2)

        # --------------------------------------------------
        # 1. Embeddings & Pooling
        # --------------------------------------------------
        ax1 = plt.subplot(3, 2, 1)
        sns.heatmap(embeddings[:, :50], cmap="viridis", cbar=False, ax=ax1)
        ax1.set_title("STEP 1: Embeddings & Pooling", fontsize=13, fontweight='bold')
        ax1.set_ylabel("Sentences (Docs)")
        ax1.set_xlabel("Vector Dimensions (First 50 shown)")

        emb_shape = embeddings.shape
        emb_sample = np.round(embeddings[0, :5], 3).tolist()  # First 5 numbers of Doc 1
        math_text_1 = (
            r"$\mathbf{Math (Mean Pool):} \quad v = \frac{1}{N} \sum_{i=1}^{N} \text{BERT}(w_i)$" + "\n"
            f"Matrix Shape: {emb_shape} (Docs x Dims)\n"
            f"Doc 1 [Dims 1-5]: {emb_sample}..."
        )
        ax1.text(0.5, -0.25, math_text_1, fontsize=11, ha='center', va='top', transform=ax1.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 2. Geometry (Dimensionality Reduction)
        # --------------------------------------------------
        ax2 = plt.subplot(3, 2, 2)
        reduced_embeddings = topic_model.umap_model.transform(embeddings)
        # Only the first two reduced dimensions are plotted.
        ax2.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c='grey', s=100, alpha=0.6, edgecolor='k')
        ax2.set_title(f"STEP 2: Geometry ({algo_used})", fontsize=13, fontweight='bold')

        red_shape = reduced_embeddings.shape
        red_sample = np.round(reduced_embeddings[0, :2], 3).tolist()  # X, Y coord of Doc 1
        eq_2 = r"$\mathbf{Math (PCA):} \quad Z = X \cdot W_{2D}$" if algo_used == "PCA" else r"$\mathbf{Math (UMAP):} \quad \text{Topological Manifold Approx.}$"
        # The dimensionality in the label is read from the array: UMAP is
        # configured with 5 components, so a fixed "2D" would be wrong there.
        math_text_2 = (
            f"{eq_2}\n"
            f"Matrix Shape: {red_shape} (Docs x {red_shape[1]}D Coordinates)\n"
            f"Doc 1 Coordinate: [X: {red_sample[0]}, Y: {red_sample[1]}]"
        )
        ax2.text(0.5, -0.25, math_text_2, fontsize=11, ha='center', va='top', transform=ax2.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 3. Clustering
        # --------------------------------------------------
        ax3 = plt.subplot(3, 2, 3)
        ax3.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=topics, cmap='tab10', s=150, edgecolor='k')
        ax3.set_title(f"STEP 3: Clustering ({cluster_algo})", fontsize=13, fontweight='bold')

        topic_sample = topics[:5]  # Grabs up to the first 5
        eq_3 = r"$\mathbf{Math (K-Means):} \quad \arg\min_S \sum ||x_i - \mu_c||^2$" if cluster_algo == "K-Means" else r"$\mathbf{Math (HDBSCAN):} \quad \text{Density} = \frac{1}{\text{core\_dist}(x)}$"
        math_text_3 = (
            f"{eq_3}\n"
            f"Output Array Shape: ({len(topics)},) (1 Label per Doc)\n"
            f"First {len(topic_sample)} Doc Assignments: {topic_sample}"
        )
        ax3.text(0.5, -0.25, math_text_3, fontsize=11, ha='center', va='top', transform=ax3.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 4. Representation
        # --------------------------------------------------
        ax4 = plt.subplot(3, 2, 4)
        # Use the first topic other than -1, if there is one.
        valid_topics = [t for t in topic_model.get_topics() if t != -1]
        theme_1_data = topic_model.get_topic(valid_topics[0]) if valid_topics else None
        if theme_1_data:
            # Reversed so the first-listed word ends up as the top bar.
            words = [x[0] for x in theme_1_data][::-1]
            scores = [x[1] for x in theme_1_data][::-1]
            ax4.barh(words, scores, color='coral', edgecolor='black')
            ax4.set_title(f"STEP 4: Topic Representation ({rep_title})", fontsize=13, fontweight='bold')

            top_word_score = round(scores[-1], 4)
            vocab_len = len(vectorizer_model.vocabulary_) if hasattr(vectorizer_model, 'vocabulary_') else 'N/A'
            math_text_4 = (
                r"$\mathbf{Math (c-TF-IDF):} \quad W_{t,c} = tf_{t,c} \times \log\left(1 + \frac{A}{df_t}\right)$" + "\n"
                f"Global Vocab Extracted: {vocab_len} terms\n"
                f"Top Word ('{words[-1]}') Score: {top_word_score}"
            )
            ax4.text(0.5, -0.25, math_text_4, fontsize=11, ha='center', va='top', transform=ax4.transAxes, bbox=box_style)
        else:
            ax4.text(0.5, 0.5, "Theme not found", ha='center', transform=ax4.transAxes)

        # --------------------------------------------------
        # 5. KPI Dashboard (Updated with UMass)
        # --------------------------------------------------
        ax5 = plt.subplot(3, 2, 5)
        ax5.axis('off')
        ax5.set_title("STEP 5: Post-Hoc Evaluation Formulas", fontsize=13, fontweight='bold', y=0.95)

        kpi_math = (
            r"$\mathbf{Diversity:} \quad D = \frac{| \text{Unique} |}{| \text{Total} |}$" + f" [Live: {div_status}]\n\n"
            r"$\mathbf{Silhouette:} \quad S = \frac{b - a}{\max(a, b)}$" + f" [Live: {sil_status}]\n\n"
            r"$\mathbf{NPMI:} \quad \frac{\log(P(x,y) / P(x)P(y))}{-\log P(x,y)}$" + f" [Live: {npmi_status}]\n\n"
            r"$\mathbf{UMass:} \quad \log \frac{P(x,y) + \epsilon}{P(x)}$" + f" [Live: {umass_status}]"
        )
        # Positioned in axes coordinates, like every other text box in the figure.
        ax5.text(0.5, 0.45, kpi_math, fontsize=12, va='center', ha='center', transform=ax5.transAxes,
                 bbox=dict(boxstyle="square,pad=1.2", facecolor='#e6f2ff', edgecolor='#377eb8', lw=2))
        ax5.text(0.5, -0.15, "Math: UMass measures internal dataset logic. NPMI measures external logic.\nSilhouette measures geometric separation.",
                 fontsize=10, ha='center', va='top', transform=ax5.transAxes, bbox=box_style)

        # --------------------------------------------------
        # 6. Summary Matrix Transformations
        # --------------------------------------------------
        ax6 = plt.subplot(3, 2, 6)
        ax6.axis('off')
        summary_text = (
            "=== THE MATRIX TRANSFORMATION LIFECYCLE ===\n\n"
            f"1. Raw Text $\\rightarrow$ {emb_shape} Matrix (Dense Meaning)\n"
            f"2. {emb_shape} $\\rightarrow$ {red_shape} Matrix (Geometric Compression)\n"
            f"3. {red_shape} $\\rightarrow$ ({len(topics)},) Array (Discrete Bucketing)\n"
            f"4. ({len(topics)},) $\\rightarrow$ c-TF-IDF Matrix (Word Extraction)\n"
            f"5. c-TF-IDF $\\rightarrow$ {words_per_theme} Output Words (Per Theme)\n\n"
            "This proves Topic Modeling is a sequence of \ndimensionality reductions and matrix multiplications."
        )
        ax6.text(0.1, 0.5, summary_text, fontsize=12, va='center', ha='left', transform=ax6.transAxes,
                 bbox=dict(boxstyle="square,pad=1", facecolor='#f0f0f0', edgecolor='grey', lw=2))

        plt.subplots_adjust(hspace=0.7, wspace=0.3)
        st.pyplot(fig, use_container_width=True)
        # Figures created through pyplot stay registered until closed; release
        # this one so repeated runs do not accumulate open figures.
        plt.close(fig)