"""
**ConceptWeb Module**
-----------------------
The `ConceptWeb` module provides tools to automatically extract, analyze, and visualize key concepts from lesson materials, helping to identify connections across topics and lessons. Central to this module is the `ConceptMapBuilder` class, which leverages a language model (LLM) to identify and structure important ideas and relationships from lesson readings and objectives into a graph-based representation.
Key functionalities of the module include:
- **Concept Extraction**:
- Identifies key concepts from lesson readings and objectives using an LLM.
- Summarizes and highlights main themes from each lesson's content.
- **Relationship Mapping**:
- Extracts and maps relationships between identified concepts based on lesson objectives and content.
- Facilitates understanding of how topics interrelate within and across lessons.
- **Graph-Based Visualization**:
- Constructs a concept map in which nodes represent concepts and edges represent relationships.
- Generates both interactive graph-based visualizations (HTML) and word clouds for key concepts.
- **Community Detection**:
- Groups closely related concepts into thematic clusters.
- Helps identify broader themes or subtopics within the lesson materials.
- **Data Saving**:
- Optionally saves intermediate data (concepts and relationships) as JSON files for further review or analysis.
Dependencies
~~~~~~~~~~~~~
This module depends on:
- `langchain_core`: For LLM-based extraction and summarization tasks.
- `networkx`: For graph generation and analysis of concept relationships.
- `matplotlib` or `plotly`: For creating visualizations and word clouds.
- Custom utilities for loading documents, extracting objectives, and handling logging.
Usage Overview
~~~~~~~~~~~~~~
1. **Initialize ConceptMapBuilder**:
- Instantiate `ConceptMapBuilder` with a `LessonLoader` (which holds the syllabus, reading, and project paths), an LLM, the course name, and the lesson range to process.
2. **Generate the Concept Map**:
- Use `build_concept_map()` to process lesson materials, extract and summarize concepts, map relationships, and generate visualizations.
3. **Save and Review**:
- The generated concept map can be saved as an interactive HTML file or as a static word cloud for easier review and analysis.
Example
~~~~~~~~

.. code-block:: python

    from pathlib import Path

    from class_factory.concept_web.ConceptMapBuilder import ConceptMapBuilder
    from class_factory.utils.load_documents import LessonLoader
    from langchain_openai import ChatOpenAI

    # Set up paths and initialize components
    syllabus_path = Path("/path/to/syllabus.docx")
    reading_dir = Path("/path/to/lesson/readings")
    project_dir = Path("/path/to/project")
    llm = ChatOpenAI(api_key="your_api_key")

    # Initialize the lesson loader and concept map builder
    lesson_loader = LessonLoader(syllabus_path=syllabus_path, reading_dir=reading_dir, project_dir=project_dir)
    concept_map_builder = ConceptMapBuilder(
        lesson_no=1,
        lesson_loader=lesson_loader,
        llm=llm,
        course_name="Sample Course",
        lesson_range=range(1, 5)
    )

    # Build and visualize the concept map
    concept_map_builder.build_concept_map()
"""
# %%
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
# parser setup
from langchain_core.output_parsers import JsonOutputParser
from tqdm import tqdm
# self-made conceptweb functions
from class_factory.concept_web.build_concept_map import (build_graph,
detect_communities)
from class_factory.concept_web.concept_extraction import (
extract_concepts_from_relationships, extract_relationships,
process_relationships, summarize_text)
from class_factory.concept_web.prompts import (relationship_prompt,
summary_prompt)
from class_factory.concept_web.visualize_graph import \
visualize_graph_interactive
from class_factory.utils.base_model import BaseModel
from class_factory.utils.load_documents import LessonLoader
# %%
class ConceptMapBuilder(BaseModel):
    """
    Orchestrates the extraction, analysis, and visualization of key concepts and their relationships from lesson materials.

    Uses a language model (LLM) to summarize content, extract relationships, and build a graph-based concept map.
    Provides methods for processing lessons, saving intermediate data, and generating interactive visualizations.
    """

    def __init__(self, lesson_no: int, lesson_loader: LessonLoader, llm, course_name: str,
                 output_dir: Optional[Union[str, Path]] = None, lesson_range: Optional[Union[range, int]] = None,
                 lesson_objectives: Optional[Union[List[str], Dict[str, str]]] = None,
                 verbose: bool = False, save_relationships: bool = False, **kwargs):
        """
        Initialize the ConceptMapBuilder with configuration for concept map generation.

        Args:
            lesson_no (int): Current lesson number.
            lesson_loader (LessonLoader): Loader for lesson materials.
            llm (Any): Language model instance.
            course_name (str): Name of the course.
            output_dir (Union[str, Path], optional): Base output directory. A lesson-specific
                subfolder (``L<first>_<last>`` or ``L<n>`` for a single lesson) is created inside it.
                When omitted, ``<project_dir>/ClassFactoryOutput/ConceptWeb`` is used as the base.
            lesson_range (Union[range, int], optional): Lessons to process. An int is treated as a
                single-lesson range. When omitted, only ``lesson_no`` is processed.
            lesson_objectives (Union[List[str], Dict[str, str]], optional): User-defined lesson objectives.
            verbose (bool, optional): Enable verbose logging. Defaults to False.
            save_relationships (bool, optional): Save relationships as JSON. Defaults to False.
            **kwargs: Additional options. ``summary_prompt`` and ``relationship_prompt`` override the
                default prompts; ``method`` selects the community detection method used by
                `build_concept_map`.
        """
        # Initialize BaseModel with shared attributes
        super().__init__(lesson_no=lesson_no, course_name=course_name, lesson_loader=lesson_loader,
                         output_dir=output_dir, verbose=verbose)

        # other setup
        self.llm = llm
        self.course_name = course_name

        # The original default (None) crashed on min()/max() below; fall back to the current lesson.
        if lesson_range is None:
            lesson_range = lesson_no
        self.lesson_range = range(lesson_range, lesson_range + 1) if isinstance(lesson_range, int) else lesson_range

        self.save_relationships = save_relationships
        self.relationship_list = []
        self.concept_list = []
        self.prompts = {'summary': kwargs.get('summary_prompt', summary_prompt),
                        'relationship': kwargs.get('relationship_prompt', relationship_prompt)}
        self.verbose = verbose
        self.timestamp = datetime.now().strftime("%Y%m%d")

        # set output directory
        first_lesson, last_lesson = min(self.lesson_range), max(self.lesson_range)
        # Resolve the base directory first, then append the lesson subfolder. Doing both in a single
        # conditional expression previously evaluated Path(None) for a single lesson with no output_dir.
        if output_dir:
            base_dir = Path(output_dir)
        else:
            base_dir = Path(self.lesson_loader.project_dir) / "ClassFactoryOutput" / "ConceptWeb"
        subfolder = f"L{first_lesson}_{last_lesson}" if first_lesson != last_lesson else f"L{first_lesson}"
        self.output_dir = base_dir / subfolder
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # load user objectives and readings
        self.user_objectives = self.set_user_objectives(lesson_objectives, self.lesson_range) if lesson_objectives else {}
        self.G = None
        self.readings = self._load_readings(self.lesson_range)
        self.kwargs = kwargs

    def _summarize_document(self, document: str) -> str:
        """
        Summarize a single document using the LLM and summary prompt.

        Args:
            document (str): Document content to summarize.

        Returns:
            str: Summarized content.
        """
        return summarize_text(document, prompt=self.prompts['summary'], course_name=self.course_name, llm=self.llm)

    def _extract_relationships(self, summary: str, objectives: str) -> List[Tuple[str, str, str]]:
        """
        Extract relationships between concepts from a summary and objectives using the LLM.

        Args:
            summary (str): Summarized document content.
            objectives (str): Lesson objectives for context.

        Returns:
            List[Tuple[str, str, str]]: List of (concept1, relation, concept2) tuples.
        """
        # NOTE(review): self.prompts['relationship'] is stored in __init__ but is not passed here;
        # confirm whether extract_relationships should receive the (possibly custom) prompt.
        return extract_relationships(summary, objectives, self.course_name, llm=self.llm, verbose=self.verbose)

    def load_and_process_lessons(self, threshold: float = 0.995):
        """
        Process lesson materials by summarizing content and extracting concept relationships for each lesson.

        Args:
            threshold (float, optional): Similarity threshold for extracted concepts. Defaults to 0.995.

        For each lesson in `lesson_range`:
            - Load documents and objectives.
            - Summarize readings using the LLM.
            - Extract relationships between concepts and generate a unique concept list.
        """
        total_lessons = len(self.readings)
        total_documents = sum(len(readings) for readings in self.readings.values())

        self.logger.info(f"\n=== Starting Concept Extraction from {self.lesson_loader.reading_dir} ===\n")
        self.logger.info(f"Processing {total_lessons} lesson(s) with {total_documents} total document(s)\n")

        # Holds, per lesson key, the raw readings alongside their LLM summaries
        self.readings_with_summaries = {}

        # summarize readings with progress bar
        lessons_progress = tqdm(self.readings.items(), desc="Processing lessons", unit="lesson")
        for lesson_idx, (lesson, readings) in enumerate(lessons_progress, 1):
            lesson_num = int(lesson)
            if lesson_num not in self.lesson_range:
                self.logger.info(f"Lesson {lesson_num} not in provided lesson range. Skipping this reading. "
                                 "If this is an error, adjust provided lesson_range")
                continue

            self.logger.info(f"[{lesson_idx}/{total_lessons}] Processing Lesson {lesson_num} ({len(readings)} document(s))")
            lesson_objectives = self._get_lesson_objectives(lesson_num)

            # Summaries for this lesson, in the same order as its readings
            summaries = []
            for document in readings:
                summary = self._summarize_document(document)
                summaries.append(summary)

                relationships = self._extract_relationships(summary, lesson_objectives)
                concepts = extract_concepts_from_relationships(relationships)

                self.relationship_list.extend(relationships)
                self.concept_list.extend(concepts)

            # Store both readings and summaries under the original lesson key
            self.readings_with_summaries[lesson] = {
                'readings': readings,
                'summaries': summaries
            }

        # Process relationships to normalize concepts
        initial_relationships = len(self.relationship_list)
        initial_concepts = len(self.concept_list)

        self.logger.info("\n=== Processing and Normalizing Extracted Data ===\n")
        self.logger.info(f"Initial extraction: {initial_relationships} relationships, {initial_concepts} concepts")

        self.relationship_list = process_relationships(self.relationship_list, threshold=threshold)
        # De-duplicate while keeping first-seen order so saved output is stable between runs
        self.concept_list = list(dict.fromkeys(self.concept_list))

        final_relationships = len(self.relationship_list)
        final_concepts = len(self.concept_list)

        self.logger.info(f"After processing: {final_relationships} relationships, {final_concepts} unique concepts")
        self.logger.info("=== Concept Extraction Complete ===\n")

    def _save_intermediate_data(self):
        """
        Save extracted concepts and relationships as JSON files in the output directory.

        Triggered if `save_relationships` is True.

        Files saved:
            - `conceptlist_<timestamp>_Lsn_<lesson_range>.json`: List of unique concepts.
            - `relationship_list_<timestamp>_Lsn_<lesson_range>.json`: List of relationships.

        Raises:
            OSError: If saving files fails.
        """
        # `<lesson_range>` is the formatted range object itself (e.g. "range(1, 11)");
        # kept as-is so file names stay compatible with earlier runs.
        suffix = f"{self.timestamp}_Lsn_{self.lesson_range}.json"
        with open(self.output_dir / f'conceptlist_{suffix}', 'w', encoding='utf-8') as f:
            json.dump(self.concept_list, f)
        with open(self.output_dir / f'relationship_list_{suffix}', 'w', encoding='utf-8') as f:
            json.dump(self.relationship_list, f)

    def _build_graph(self, method: str = 'leiden', directed: bool = False):
        """
        Build the concept graph from the processed relationships and assign communities.

        The resulting graph is stored on `self.G`. When only one lesson is in `lesson_range`,
        community detection is skipped and every node is assigned to community 0.

        Args:
            method (str, optional): Community detection method ('leiden', 'louvain', 'spectral'). Defaults to 'leiden'.
            directed (bool, optional): If True, creates a directed graph. Defaults to False.

        Raises:
            ValueError: If an unrecognized community detection method is used (multi-lesson ranges only).
        """
        self.logger.info("\nBuilding graph...")
        self.G = build_graph(processed_relationships=self.relationship_list, directed=directed)

        # Skip community detection if there's only one lesson
        if len(self.lesson_range) <= 1:
            self.logger.info("\nSingle lesson detected. Skipping community detection.")
            for node in self.G.nodes:
                self.G.nodes[node]["community"] = 0  # Assign all nodes to community 0
            return

        self.logger.info("\nDetecting communities...")
        if method not in ('leiden', 'louvain', 'spectral'):
            raise ValueError("Community detection method not recognized. Please select from 'leiden', 'louvain', or 'spectral'.")
        self.G = detect_communities(self.G, method=method)

    def _visualize_graph(self, directed: bool = False, dark_mode: bool = True, max_nodes: int = 250,
                         centrality_method: str = "degree", expand_neighbors: bool = True):
        """
        Write the interactive HTML visualization of `self.G` into the output directory.

        The file is named `interactive_concept_map_<timestamp>_Lsn_<lesson_range>.html`.
        All arguments are forwarded unchanged to `visualize_graph_interactive`.

        Args:
            directed (bool, optional): Forwarded as `directed`. Defaults to False.
            dark_mode (bool, optional): Forwarded as `dark_mode`. Defaults to True.
            max_nodes (int, optional): Forwarded as `max_nodes`. Defaults to 250.
            centrality_method (str, optional): Forwarded as `centrality_method`. Defaults to "degree".
            expand_neighbors (bool, optional): Forwarded as `expand_neighbors`. Defaults to True.
        """
        output_html_path = self.output_dir / f"interactive_concept_map_{self.timestamp}_Lsn_{self.lesson_range}.html"
        visualize_graph_interactive(
            self.G,
            output_path=output_html_path,
            directed=directed,
            dark_mode=dark_mode,
            max_nodes=max_nodes,
            centrality_method=centrality_method,
            expand_neighbors=expand_neighbors
        )

    def build_concept_map(self, directed: bool = False, concept_similarity_threshold: float = 0.995,
                          dark_mode: bool = True, lesson_objectives: Optional[Dict[str, str]] = None) -> None:
        """
        Run the full pipeline to generate a concept map and visualization.

        Steps: process lessons, optionally save intermediate JSON, build the graph
        (with community detection), and write the interactive visualization.

        Args:
            directed (bool, optional): Whether to create a directed concept map. Defaults to False.
            concept_similarity_threshold (float, optional): Threshold for concept similarity. Defaults to 0.995.
            dark_mode (bool, optional): Use dark mode for visualization. Defaults to True.
            lesson_objectives (Optional[Dict[str, str]], optional): User-provided lesson objectives.
                When given, they replace any objectives set at construction; when omitted,
                the objectives set at construction are kept. Defaults to None.
        """
        # Only override when new objectives are supplied; otherwise objectives passed to
        # __init__ would be silently discarded.
        if lesson_objectives:
            self.user_objectives = self.set_user_objectives(lesson_objectives, self.lesson_range)

        method = self.kwargs.get('method', 'leiden')

        self.load_and_process_lessons(threshold=concept_similarity_threshold)
        if self.save_relationships:
            self._save_intermediate_data()
        self._build_graph(method=method, directed=directed)
        self._visualize_graph(directed=directed, dark_mode=dark_mode)


if __name__ == "__main__":
    # Manual smoke run: builds a concept map for lessons 1-10 of the configured course.
    import os
    from pathlib import Path

    import yaml
    from dotenv import load_dotenv
    from langchain_community.llms import Ollama  # kept available for swapping in a local model
    from langchain_openai import ChatOpenAI
    # env setup
    from pyprojroot.here import here

    from class_factory.utils.tools import reset_loggers

    reset_loggers()
    load_dotenv()

    home_dir = Path.home()

    # Path definitions (read from the course entry of the local YAML config)
    with open("class_config.yaml", "r") as file:
        config = yaml.safe_load(file)

    class_config = config['PS211']
    slide_dir = home_dir / class_config['slideDir']
    syllabus_path = home_dir / class_config['syllabus_path']
    reading_dir = home_dir / class_config['reading_dir']
    is_tabular_syllabus = class_config['is_tabular_syllabus']
    project_dir = here()

    # Example usage
    openai_settings = {
        "model": "gpt-4o-mini",
        "temperature": 0.3,
        "max_tokens": None,
        "timeout": None,
        "max_retries": 2,
        "api_key": os.getenv('openai_key'),
        "organization": os.getenv('openai_org'),
    }
    llm = ChatOpenAI(**openai_settings)
    # Local alternative:
    # llm = Ollama(
    #     model="llama3.1",
    #     temperature=0.1,
    # )

    loader = LessonLoader(
        syllabus_path=syllabus_path,
        reading_dir=reading_dir,
        project_dir=project_dir,
    )

    builder = ConceptMapBuilder(
        lesson_loader=loader,
        llm=llm,
        course_name="American Politics",
        lesson_no=10,
        lesson_range=range(1, 11),
        output_dir=None,
        verbose=False,
    )

    builder.build_concept_map(directed=True, dark_mode=False)
# %%