Spaces:

gtech13
/

Topic-Modeling-BERTopic-Math-Visualization

Sleeping

App Files Files Community

Topic-Modeling-BERTopic-Math-Visualization / app.py

gtech13

Update app.py

d8c503d verified 24 days ago

Raw

History Blame Contribute Delete

23.9 kB

	import os
	os.environ["OMP_NUM_THREADS"] = "1"
	os.environ["TOKENIZERS_PARALLELISM"] = "false"

	import streamlit as st
	import pandas as pd
	import warnings
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns

	from bertopic import BERTopic
	from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
	from sentence_transformers import SentenceTransformer, models
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.decomposition import PCA
	from sklearn.cluster import KMeans
	from sklearn.metrics import silhouette_score
	from umap import UMAP
	from hdbscan import HDBSCAN
	import gensim.corpora as corpora
	from gensim.models.coherencemodel import CoherenceModel

	warnings.filterwarnings("ignore")

	# ==========================================
	# 1. PAGE CONFIGURATION & MAPPINGS
	# ==========================================
	st.set_page_config(
	    page_title="Topic Modeling Pipeline",
	    layout="wide",
	    initial_sidebar_state="expanded",
	)

	# Layout-stability CSS, injected straight after set_page_config so the page
	# does not shift left/right when the vertical scrollbar appears or disappears.
	_LAYOUT_CSS = """
	<style>
	/* Force the vertical scrollbar to always show so the page width never changes */
	html { overflow-y: scroll; }
	/* Prevent horizontal scrolling */
	.block-container { max-width: 100%; overflow-x: hidden; }
	</style>
	"""
	st.markdown(_LAYOUT_CSS, unsafe_allow_html=True)

	# Human-readable dropdown label -> model identifier handed to the embedder loader.
	EMBEDDING_MAP = {
	    "MiniLM (Fast & Lightweight)": "sentence-transformers/all-MiniLM-L6-v2",
	    "MPNet (High Accuracy)": "sentence-transformers/all-mpnet-base-v2",
	    "Specter2 (Scientific/Academic)": "allenai/specter2_base",
	}

	# Human-readable dropdown label -> pooling keyword. The loader tests these
	# keywords by substring, so "mean-max" switches on both mean and max pooling.
	POOLING_MAP = {
	    "Mean (Smooth context)": "mean",
	    "Max (Sharp keywords)": "max",
	    "CLS (Classification)": "cls",
	    "Mean-Max (Combined)": "mean-max",
	}

	# --- CACHE THE NEURAL NETWORK ---
	@st.cache_resource
	def load_embedder(model_name, pool_strat):
	    """Assemble a sentence embedder: transformer backbone -> pooling -> normalization.

	    Args:
	        model_name: Model identifier passed unchanged to ``models.Transformer``.
	        pool_strat: Pooling keyword. Each of the substrings ``"mean"``,
	            ``"max"`` and ``"cls"`` enables the matching pooling mode, so a
	            value such as ``"mean-max"`` enables two modes at once.

	    Returns:
	        A ``SentenceTransformer`` built from the three modules, in that order.
	    """
	    backbone = models.Transformer(model_name)
	    token_dim = backbone.get_word_embedding_dimension()

	    # Substring tests (not equality) so combined keywords enable several modes.
	    pooling_flags = {
	        "pooling_mode_mean_tokens": "mean" in pool_strat,
	        "pooling_mode_max_tokens": "max" in pool_strat,
	        "pooling_mode_cls_token": "cls" in pool_strat,
	    }
	    pooler = models.Pooling(token_dim, **pooling_flags)

	    # The Normalize module is always appended (L2 normalization), so every
	    # downstream distance computation sees normalized vectors.
	    pipeline = [backbone, pooler, models.Normalize()]
	    return SentenceTransformer(modules=pipeline)

	# ==========================================
	# 2. THE GUIDED UI
	# ==========================================
	st.title("BERTopic - Topic Modeling Pipeline with Math Visualization")

	# Optional banner image. Check for the file explicitly rather than relying on
	# a specific exception type: the error raised for a missing media file is not
	# necessarily FileNotFoundError, so the old try/except could let it through.
	if os.path.isfile("pipeline.png"):
	    st.image("pipeline.png", use_container_width=True)

	st.divider()

	st.header("📥 Step 0: Input Data & Core Settings")
	st.info("💡 Preprocessing Note: You do not need to manually lowercase or strip punctuation. The `CountVectorizer` algorithm and the `Uncased` BERT Neural Networks handle casing and token normalization autonomously at the mathematical level.")

	data_source = st.radio("Choose Data Source:", ["Use Sample ACM Abstract", "Paste Text"], horizontal=True)

	sample_abstract = """
	Students who registered for the Mapping with Google massive open online course (MOOC)
	were asked several questions during the registration process to identify prior
	experience with eleven skills as well as their goals for registering for the course.
	At the end of the course, we compared students' self reports of goal achievement
	with behavioral click-stream analysis. In addition, we assessed how well prior
	skill in a subject predicts a student's course completion and found no correlation.
	Our research shows that students who completed course activities were more likely
	to earn certificates of completion than peers who did not.
	"""

	# The text area is pre-filled with the sample only when that source is selected.
	raw_data = st.text_area("Text Data:", value=sample_abstract if data_source == "Use Sample ACM Abstract" else "", height=150)

	col_a, col_b = st.columns(2)
	with col_a:
	    n_themes = st.slider("Target Number of Themes", 2, 20, 3)
	with col_b:
	    words_per_theme = st.slider("Words to Output per Theme", 3, 10, 5)

	# --- THE VERTICAL CONFIGURATION WIZARD ---
	st.header("⚙️ Model Configuration")

	with st.expander("1️⃣ Semantic Layer (Embeddings & Pooling)", expanded=True):
	    st.markdown("💡 BERTopic Default:* Uses `all-MiniLM-L6-v2` with `Mean` pooling.*")
	    ui_embedding = st.selectbox("Embedding Model (Override Default):", list(EMBEDDING_MAP.keys()))
	    ui_pooling = st.selectbox("Pooling Strategy (Override Default):", list(POOLING_MAP.keys()))

	with st.expander("2️⃣ Geometry Layer (Dimensionality Reduction)", expanded=True):
	    st.markdown("💡 BERTopic Default:* Uses `UMAP` with `Cosine` distance to reduce 384D to 5D space.*")
	    ui_algo = st.selectbox("Algorithm", ["UMAP (Complex geometry)", "PCA (Fast/Deterministic)"])
	    if "UMAP" in ui_algo:
	        ui_metric = st.selectbox("Distance Metric", ["cosine", "euclidean", "manhattan"])
	    else:
	        # No metric choice is offered for PCA; a fixed value keeps ui_metric defined.
	        ui_metric = "euclidean"
	        st.info("PCA inherently uses Variance (Euclidean math), so distance metrics are bypassed.")

	with st.expander("3️⃣ Clustering Layer (Grouping)", expanded=True):
	    st.markdown("💡 BERTopic Default:* Uses `HDBSCAN` exclusively (which crashes on tiny datasets).*")
	    # Raw string: the text contains the LaTeX command \ge, which is an invalid
	    # escape sequence in a normal string literal. The runtime text is unchanged.
	    st.markdown(r"""
	The model mathematically draws boundaries around similar sentences.
	* Primary clustering algorithm (HDBSCAN): Runs on datasets $\ge$ 15 sentences. Automatically filters outliers and finds dense semantic clouds. (Defaults: min_cluster_size=10)
	* Fallback clustering algorithm (K-Means): Runs on datasets $<$ 15 sentences. Forces all sentences into buckets to prevent math crashes on tiny text samples.
	""")

	with st.expander("4️⃣ Vocabulary Layer (Vectorization)", expanded=True):
	    st.markdown("💡 BERTopic Default:* Uses `Unigrams` (1 word) and does not filter redundant dataset noise.*")
	    ngram_range = st.slider("N-Gram Range", 1, 3, (1, 2), help="1=Unigrams, 2=Bigrams (e.g., 'machine learning')")
	    auto_noise = st.checkbox("Auto-Remove Redundant Noise (max_df)", value=True, help="Mathematically deletes words appearing in >85% of documents.")

	with st.expander("5️⃣ Extraction Layer (Representation)", expanded=True):
	    st.markdown("💡 BERTopic Default:* ALWAYS extracts baseline words using c-TF-IDF (Word Frequency).*")
	    ui_extraction = st.selectbox("Apply Advanced Filter on top of c-TF-IDF:", ["None (Base c-TF-IDF only)", "KeyBERTInspired (Semantic cosine)", "MMR (Reduce redundancy)"])
	    # The diversity slider only exists when MMR is selected; otherwise None.
	    if "MMR" in ui_extraction:
	        mmr_diversity = st.slider("MMR Diversity Penalty", 0.0, 1.0, 0.3)
	    else:
	        mmr_diversity = None

	st.header("📊 Evaluation Metrics")
	eval_metrics = st.multiselect(
	    "Select KPIs to generate a final report card:",
	    ["Topic Diversity", "NPMI Coherence", "UMass Coherence", "Silhouette Score"],
	    default=["Topic Diversity", "NPMI Coherence", "UMass Coherence", "Silhouette Score"]
	)

	st.divider()

	# ==========================================
	# 3. ENGINE EXECUTION
	# ==========================================
	# Everything below runs only on the rerun triggered by pressing the button:
	# build the BERTopic pipeline from the widget values, fit it on the input
	# sentences, then render the themes, the selected KPIs and the XAI figure.
	if st.button("🚀 Run Topic Modeling Pipeline", type="primary", use_container_width=True):

	    if not raw_data or len(raw_data) < 20:
	        st.error("Please provide more text data!")
	        st.stop()

	    # Naive sentence split on '.'; fragments of 10 characters or fewer are dropped.
	    sentences = [s.strip() for s in raw_data.split('.') if len(s.strip()) > 10]
	    dataset_size = len(sentences)

	    # The small-data path uses PCA(n_components=2) and K-Means, which cannot
	    # be fitted on fewer than two documents; fail with a message, not a traceback.
	    if dataset_size < 2:
	        st.error("Please provide at least two sentences (each longer than 10 characters, separated by periods).")
	        st.stop()

	    with st.spinner("Processing Semantic Pipeline... (Models are cached to prevent crashes)"):

	        academic_noise = ['students', 'course', 'research', 'paper', 'found', 'likely', 'did']
	        from sklearn.feature_extraction import text
	        stop_w = list(text.ENGLISH_STOP_WORDS.union(academic_noise))

	        # max_df filtering is only applied when enabled AND there are more than 10 sentences.
	        vectorizer_model = CountVectorizer(stop_words=stop_w, ngram_range=ngram_range, max_df=0.85 if auto_noise and dataset_size > 10 else 1.0)

	        custom_embedder = load_embedder(EMBEDDING_MAP[ui_embedding], POOLING_MAP[ui_pooling])
	        embeddings = custom_embedder.encode(sentences)

	        is_fallback = False
	        if dataset_size < 15 or "PCA" in ui_algo:
	            # Small dataset or explicit PCA choice: PCA + K-Means.
	            # K-Means cannot have more clusters than documents.
	            safe_n_themes = min(n_themes, dataset_size)
	            dim_model = PCA(n_components=2, random_state=42)
	            cluster_model = KMeans(n_clusters=safe_n_themes, random_state=42)
	            reduce_topics = None
	            is_fallback = True
	            algo_used = "PCA"
	            cluster_algo = "K-Means"
	        else:
	            dim_model = UMAP(n_neighbors=15, n_components=5, metric=ui_metric, random_state=42)
	            # FIX: this was bound to `clustering_model`, leaving `cluster_model`
	            # undefined on this branch and raising NameError in BERTopic(...) below.
	            cluster_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')
	            reduce_topics = n_themes
	            algo_used = "UMAP"
	            cluster_algo = "HDBSCAN"

	        if "MMR" in ui_extraction:
	            rep_model = MaximalMarginalRelevance(diversity=mmr_diversity, top_n_words=words_per_theme)
	        elif "KeyBERT" in ui_extraction:
	            rep_model = KeyBERTInspired(top_n_words=words_per_theme)
	        else:
	            rep_model = None

	        # The reducer and clusterer are passed through BERTopic's umap_model /
	        # hdbscan_model slots whichever algorithm was actually chosen.
	        topic_model = BERTopic(
	            embedding_model=custom_embedder,
	            umap_model=dim_model,
	            hdbscan_model=cluster_model,
	            vectorizer_model=vectorizer_model,
	            representation_model=rep_model,
	            nr_topics=reduce_topics,
	            top_n_words=words_per_theme,
	            language="english"
	        )
	        topics, _ = topic_model.fit_transform(sentences)

	    # ==========================================
	    # 4. UI DISPLAY & METRICS
	    # ==========================================
	    st.success("Analysis Complete!")

	    if is_fallback:
	        if safe_n_themes < n_themes:
	            st.warning(f"⚠️ Reduced requested themes from {n_themes} to {safe_n_themes}.\n\n"
	                       f"Reason: BERTopic clusters complete sentences to preserve context. "
	                       f"You cannot sort {dataset_size} sentences into {n_themes} buckets without leaving empty buckets, "
	                       f"which mathematically breaks the clustering algorithms!")
	        else:
	            st.info(f"ℹ️ Auto-switched to PCA/K-Means due to small dataset size ({dataset_size} sentences).")

	    st.markdown("### 🏆 Discovered Themes")
	    topic_info = topic_model.get_topic_info()
	    all_words = []

	    # Lay the themes out three per row; topic -1 is skipped.
	    cols = st.columns(3)
	    col_idx = 0
	    for t_id in topic_info['Topic']:
	        if t_id == -1:
	            continue
	        theme_w = [w[0] for w in topic_model.get_topic(t_id)]
	        all_words.append(theme_w)
	        with cols[col_idx % 3]:
	            st.info(f"Theme {t_id + 1}\n\n" + "\n".join([f"🔹 {w}" for w in theme_w]))
	        col_idx += 1

	    # --- METRICS CALCULATIONS ---
	    # Status strings default to "Skipped" so the figure below can always show them.
	    div_status, npmi_status, umass_status, sil_status = "Skipped", "Skipped", "Skipped", "Skipped"

	    # Guarded as a unit: st.columns() cannot be called with zero columns, so
	    # the KPI row is rendered only when at least one metric is selected.
	    if eval_metrics:
	        with st.spinner("Calculating mathematical metrics... (NPMI requires building a dictionary and takes a moment)"):

	            # 1. Diversity: unique topic words / total topic words.
	            if "Topic Diversity" in eval_metrics and all_words:
	                u_words = {w for t in all_words for w in t}
	                t_words = sum(len(t) for t in all_words)
	                div_val = float(len(u_words) / t_words) if t_words > 0 else 0.0
	                div_status = f"{div_val:.2f}"

	            # 2. Coherence Models (NPMI & UMass)
	            if "NPMI Coherence" in eval_metrics or "UMass Coherence" in eval_metrics:
	                try:
	                    # Build the analyzer once instead of once per sentence.
	                    analyzer = vectorizer_model.build_analyzer()
	                    tokenized = [analyzer(s) for s in sentences]
	                    dictionary = corpora.Dictionary(tokenized)

	                    if "NPMI Coherence" in eval_metrics:
	                        cm_npmi = CoherenceModel(topics=all_words, texts=tokenized, dictionary=dictionary, coherence='c_npmi')
	                        temp_npmi = cm_npmi.get_coherence()
	                        if np.isnan(temp_npmi):
	                            npmi_status = "N/A (Too few words)"
	                        else:
	                            npmi_status = f"{float(temp_npmi):.2f}"

	                    if "UMass Coherence" in eval_metrics:
	                        cm_umass = CoherenceModel(topics=all_words, texts=tokenized, dictionary=dictionary, coherence='u_mass')
	                        temp_umass = cm_umass.get_coherence()
	                        if np.isnan(temp_umass):
	                            umass_status = "N/A (Too few words)"
	                        else:
	                            umass_status = f"{float(temp_umass):.2f}"
	                except Exception:
	                    # Deliberate best effort: a coherence failure is shown as a
	                    # skipped KPI rather than aborting the whole report.
	                    npmi_status = "Skipped (Data too small)"
	                    umass_status = "Skipped (Data too small)"

	            # 3. Silhouette Score (outlier label -1 excluded)
	            if "Silhouette Score" in eval_metrics:
	                valid_idx = [i for i, t in enumerate(topics) if t != -1]
	                unique_topics = {topics[i] for i in valid_idx}
	                # Needs at least 2 clusters and fewer clusters than samples.
	                if 1 < len(unique_topics) < len(valid_idx):
	                    sil_val = float(silhouette_score(
	                        np.array([embeddings[i] for i in valid_idx]),
	                        [topics[i] for i in valid_idx],
	                        metric='cosine'
	                    ))
	                    sil_status = f"{sil_val:.2f}"
	                else:
	                    sil_status = "Skipped (Themes need ≥2 sentences)"

	        # --- RENDER KPI DASHBOARD WITH TOOLTIPS ---
	        st.markdown("### 📊 Key Performance Indicators (KPI)")
	        kpi_cols = st.columns(len(eval_metrics))

	        # Metric name -> (displayed value, tooltip). The card label equals the metric name.
	        kpi_cards = {
	            "Topic Diversity": (
	                div_status,
	                "Math: Unique Words / Total Words.\nTarget: 1.0 (No redundant words across themes).",
	            ),
	            "NPMI Coherence": (
	                npmi_status,
	                "Math: Normalized Pointwise Mutual Information.\nCalculates joint probability of words existing together.\nTarget: >0.1",
	            ),
	            "UMass Coherence": (
	                umass_status,
	                "Math: Internal log-conditional probability.\nEvaluates if words co-occur strictly inside your uploaded dataset.\nTarget: Closer to 0.",
	            ),
	            "Silhouette Score": (
	                sil_status,
	                "Math: (b - a) / max(a,b).\nMeasures intra-cluster density (a) vs nearest-cluster distance (b).\nTarget: >0.0",
	            ),
	        }
	        for kpi_col, metric in zip(kpi_cols, eval_metrics):
	            kpi_value, kpi_help = kpi_cards[metric]
	            with kpi_col:
	                st.metric(label=metric, value=kpi_value, help=kpi_help)

	    # ==========================================
	    # 5. XAI VISUALIZATION GRAPH (With Live Math & Matrices)
	    # ==========================================
	    st.markdown("### 📈 Explainable AI (XAI) Architecture Map")

	    with st.spinner("Rendering Mathematical Dashboard..."):
	        sns.set_theme(style="whitegrid")
	        fig = plt.figure(figsize=(18, 16))

	        # First word of each UI label is used in the titles.
	        pool_title = ui_pooling.split()[0]
	        rep_title = ui_extraction.split()[0]
	        # FIX: a plain '|' separator; the former '\|' was an invalid escape
	        # sequence that put a literal backslash in the title.
	        fig.suptitle(f"Topic Modeling Mathematical Pipeline\n(Pooling: {pool_title} | Rep: {rep_title})", fontsize=20, fontweight='bold', y=0.98)

	        # Style for the Math/Data boxes
	        box_style = dict(boxstyle="round,pad=0.5", facecolor='#f8f9fa', edgecolor='#4b72b8', alpha=0.95, lw=2)

	        # --------------------------------------------------
	        # 1. Embeddings & Pooling
	        # --------------------------------------------------
	        # Axes are created on `fig` directly instead of through pyplot's
	        # global "current figure" state.
	        ax1 = fig.add_subplot(3, 2, 1)
	        sns.heatmap(embeddings[:, :50], cmap="viridis", cbar=False, ax=ax1)
	        ax1.set_title("STEP 1: Embeddings & Pooling", fontsize=13, fontweight='bold')
	        ax1.set_ylabel("Sentences (Docs)")
	        ax1.set_xlabel("Vector Dimensions (First 50 shown)")

	        # Live Data Extraction
	        emb_shape = embeddings.shape
	        emb_sample = np.round(embeddings[0, :5], 3).tolist()  # First 5 numbers of Doc 1

	        math_text_1 = (
	            r"$\mathbf{Math (Mean Pool):} \quad v = \frac{1}{N} \sum_{i=1}^{N} \text{BERT}(w_i)$" + "\n"
	            f"Matrix Shape: {emb_shape} (Docs x Dims)\n"
	            f"Doc 1 [Dims 1-5]: {emb_sample}..."
	        )
	        ax1.text(0.5, -0.25, math_text_1, fontsize=11, ha='center', va='top', transform=ax1.transAxes, bbox=box_style)

	        # --------------------------------------------------
	        # 2. Geometry (Dimensionality Reduction)
	        # --------------------------------------------------
	        ax2 = fig.add_subplot(3, 2, 2)
	        # Only the first two reduced components are plotted.
	        reduced_embeddings = topic_model.umap_model.transform(embeddings)
	        ax2.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c='grey', s=100, alpha=0.6, edgecolor='k')
	        ax2.set_title(f"STEP 2: Geometry ({algo_used})", fontsize=13, fontweight='bold')

	        # Live Data Extraction
	        red_shape = reduced_embeddings.shape
	        red_sample = np.round(reduced_embeddings[0, :2], 3).tolist()  # X, Y coord of Doc 1

	        eq_2 = r"$\mathbf{Math (PCA):} \quad Z = X \cdot W_{2D}$" if algo_used == "PCA" else r"$\mathbf{Math (UMAP):} \quad \text{Topological Manifold Approx.}$"
	        math_text_2 = (
	            f"{eq_2}\n"
	            f"Matrix Shape: {red_shape} (Docs x 2D Coordinates)\n"
	            f"Doc 1 Coordinate: [X: {red_sample[0]}, Y: {red_sample[1]}]"
	        )
	        ax2.text(0.5, -0.25, math_text_2, fontsize=11, ha='center', va='top', transform=ax2.transAxes, bbox=box_style)

	        # --------------------------------------------------
	        # 3. Clustering
	        # --------------------------------------------------
	        ax3 = fig.add_subplot(3, 2, 3)
	        ax3.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=topics, cmap='tab10', s=150, edgecolor='k')
	        ax3.set_title(f"STEP 3: Clustering ({cluster_algo})", fontsize=13, fontweight='bold')

	        # Live Data Extraction
	        topic_sample = topics[:5]  # Grabs up to the first 5

	        # FIX: single norm bars; the former doubled '\|\|' drew two bars per side.
	        eq_3 = r"$\mathbf{Math (K-Means):} \quad \arg\min_S \sum \|x_i - \mu_c\|^2$" if cluster_algo == "K-Means" else r"$\mathbf{Math (HDBSCAN):} \quad \text{Density} = \frac{1}{\text{core\_dist}(x)}$"
	        math_text_3 = (
	            f"{eq_3}\n"
	            f"Output Array Shape: ({len(topics)},) (1 Label per Doc)\n"
	            f"First {len(topic_sample)} Doc Assignments: {topic_sample}"
	        )

	        ax3.text(0.5, -0.25, math_text_3, fontsize=11, ha='center', va='top', transform=ax3.transAxes, bbox=box_style)

	        # --------------------------------------------------
	        # 4. Representation
	        # --------------------------------------------------
	        ax4 = fig.add_subplot(3, 2, 4)

	        # Safely grab the first valid topic found
	        valid_topics = [t for t in topic_model.get_topics() if t != -1]
	        theme_1_data = topic_model.get_topic(valid_topics[0]) if valid_topics else None

	        if theme_1_data:
	            # Reversed so the first-listed word ends up at the top of the horizontal bars.
	            words = [x[0] for x in theme_1_data][::-1]
	            scores = [x[1] for x in theme_1_data][::-1]
	            ax4.barh(words, scores, color='coral', edgecolor='black')
	            ax4.set_title(f"STEP 4: Topic Representation ({rep_title})", fontsize=13, fontweight='bold')

	            # Live Data Extraction
	            top_word_score = round(scores[-1], 4)
	            vocab_len = len(vectorizer_model.vocabulary_) if hasattr(vectorizer_model, 'vocabulary_') else 'N/A'

	            math_text_4 = (
	                r"$\mathbf{Math (c-TF-IDF):} \quad W_{t,c} = tf_{t,c} \times \log\left(1 + \frac{A}{df_t}\right)$" + "\n"
	                f"Global Vocab Extracted: {vocab_len} terms\n"
	                f"Top Word ('{words[-1]}') Score: {top_word_score}"
	            )
	            ax4.text(0.5, -0.25, math_text_4, fontsize=11, ha='center', va='top', transform=ax4.transAxes, bbox=box_style)
	        else:
	            ax4.text(0.5, 0.5, "Theme not found", ha='center', transform=ax4.transAxes)

	        # --------------------------------------------------
	        # 5. KPI Dashboard (Updated with UMass)
	        # --------------------------------------------------
	        ax5 = fig.add_subplot(3, 2, 5)
	        ax5.axis('off')
	        ax5.set_title("STEP 5: Post-Hoc Evaluation Formulas", fontsize=13, fontweight='bold', y=0.95)

	        # All four formulas with the live status strings computed above.
	        kpi_math = (
	            r"$\mathbf{Diversity:} \quad D = \frac{\| \text{Unique} \|}{\| \text{Total} \|}$" + f" [Live: {div_status}]\n\n"

	            r"$\mathbf{Silhouette:} \quad S = \frac{b - a}{\max(a, b)}$" + f" [Live: {sil_status}]\n\n"

	            r"$\mathbf{NPMI:} \quad \frac{\log(P(x,y) / P(x)P(y))}{-\log P(x,y)}$" + f" [Live: {npmi_status}]\n\n"

	            r"$\mathbf{UMass:} \quad \log \frac{P(x,y) + \epsilon}{P(x)}$" + f" [Live: {umass_status}]"
	        )

	        ax5.text(0.5, 0.45, kpi_math, fontsize=12, va='center', ha='center',
	                 bbox=dict(boxstyle="square,pad=1.2", facecolor='#e6f2ff', edgecolor='#377eb8', lw=2))

	        ax5.text(0.5, -0.15, "Math: UMass measures internal dataset logic. NPMI measures external logic.\nSilhouette measures geometric separation.",
	                 fontsize=10, ha='center', va='top', transform=ax5.transAxes, bbox=box_style)

	        # --------------------------------------------------
	        # 6. Summary Matrix Transformations
	        # --------------------------------------------------
	        ax6 = fig.add_subplot(3, 2, 6)
	        ax6.axis('off')
	        summary_text = (
	            "=== THE MATRIX TRANSFORMATION LIFECYCLE ===\n\n"
	            f"1. Raw Text $\\rightarrow$ {emb_shape} Matrix (Dense Meaning)\n"
	            f"2. {emb_shape} $\\rightarrow$ {red_shape} Matrix (Geometric Compression)\n"
	            f"3. {red_shape} $\\rightarrow$ ({len(topics)},) Array (Discrete Bucketing)\n"
	            f"4. ({len(topics)},) $\\rightarrow$ c-TF-IDF Matrix (Word Extraction)\n"
	            f"5. c-TF-IDF $\\rightarrow$ {words_per_theme} Output Words (Per Theme)\n\n"
	            "This proves Topic Modeling is a sequence of \ndimensionality reductions and matrix multiplications."
	        )
	        ax6.text(0.1, 0.5, summary_text, fontsize=12, va='center', ha='left',
	                 bbox=dict(boxstyle="square,pad=1", facecolor='#f0f0f0', edgecolor='grey', lw=2))

	        fig.subplots_adjust(hspace=0.7, wspace=0.3)
	        st.pyplot(fig, use_container_width=True)
	        # Close the figure once rendered; otherwise each button press leaves
	        # another open figure registered with pyplot.
	        plt.close(fig)