"""
**ConceptWeb Module**
-----------------------
The `ConceptWeb` module provides tools to automatically extract, analyze, and visualize key concepts from lesson materials, helping to identify connections across topics and lessons. Central to this module is the `ConceptMapBuilder` class, which leverages a language model (LLM) to identify and structure important ideas and relationships from lesson readings and objectives into a graph-based representation.
Key functionalities of the module include:

- **Concept Extraction**:
    - Identifies key concepts from lesson readings and objectives using an LLM.
    - Summarizes and highlights main themes from each lesson's content.
- **Relationship Mapping**:
    - Extracts and maps relationships between identified concepts based on lesson objectives and content.
    - Facilitates understanding of how topics interrelate within and across lessons.
- **Graph-Based Visualization**:
    - Constructs a concept map in which nodes represent concepts and edges represent relationships.
    - Generates both interactive graph-based visualizations (HTML) and word clouds for key concepts.
- **Community Detection**:
    - Groups closely related concepts into thematic clusters.
    - Helps identify broader themes or subtopics within the lesson materials.
- **Data Saving**:
    - Optionally saves intermediate data (concepts and relationships) as JSON files for further review or analysis.
Dependencies
~~~~~~~~~~~~~

This module depends on:

- `langchain_core`: For LLM-based extraction and summarization tasks.
- `networkx`: For graph generation and analysis of concept relationships.
- `matplotlib` or `plotly`: For creating visualizations and word clouds.
- Custom utilities for loading documents, extracting objectives, and handling logging.
Usage Overview
~~~~~~~~~~~~~~

1. **Initialize ConceptMapBuilder**:
    - Instantiate `ConceptMapBuilder` with a lesson number, a `LessonLoader` (which holds the project directory, reading materials, and syllabus paths), an LLM, and the course name.
2. **Generate the Concept Map**:
    - Use `build_concept_map()` to process lesson materials, extract and summarize concepts, map relationships, and generate visualizations.
3. **Save and Review**:
    - The generated concept map is saved as an interactive HTML file and as a static word cloud for easier review and analysis.
Example
~~~~~~~~

.. code-block:: python

    from pathlib import Path

    from class_factory.concept_web.ConceptMapBuilder import ConceptMapBuilder
    from class_factory.utils.load_documents import LessonLoader
    from langchain_openai import ChatOpenAI

    # Set up paths and initialize components
    syllabus_path = Path("/path/to/syllabus.docx")
    reading_dir = Path("/path/to/lesson/readings")
    project_dir = Path("/path/to/project")
    llm = ChatOpenAI(api_key="your_api_key")

    # Initialize the lesson loader and concept map builder
    lesson_loader = LessonLoader(syllabus_path=syllabus_path, reading_dir=reading_dir, project_dir=project_dir)
    concept_map_builder = ConceptMapBuilder(
        lesson_no=1,
        lesson_loader=lesson_loader,
        llm=llm,
        course_name="Sample Course",
        lesson_range=range(1, 5)
    )

    # Build and visualize the concept map
    concept_map_builder.build_concept_map()
"""
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
# parser setup
from langchain_core.output_parsers import JsonOutputParser
# self-made conceptweb functions
from class_factory.concept_web.build_concept_map import (build_graph,
detect_communities)
from class_factory.concept_web.concept_extraction import (
extract_concepts_from_relationships, extract_relationships,
process_relationships, summarize_text)
from class_factory.concept_web.prompts import (relationship_prompt,
summary_prompt)
from class_factory.concept_web.visualize_graph import (
generate_wordcloud, visualize_graph_interactive)
from class_factory.utils.base_model import BaseModel
from class_factory.utils.load_documents import LessonLoader
from class_factory.utils.response_parsers import Extracted_Relations
# general utils
from class_factory.utils.tools import logger_setup
# %%
class ConceptMapBuilder(BaseModel):
    """
    Generate concept maps (a form of knowledge graph) from lesson materials.

    A language model (LLM) summarizes each reading and extracts relationships between
    concepts; the relationships are then assembled into a graph, optionally clustered
    into communities, and written out as an interactive HTML graph and a word cloud.

    Attributes:
        lesson_no (int): Current lesson number being processed.
        lesson_loader (LessonLoader): Loader instance for handling lesson materials.
        llm (Any): Language model instance for summarization and relationship extraction.
        course_name (str): Course name, used as context in LLM prompts.
        output_dir (Path): Directory for saving generated outputs.
        lesson_range (range): Range of lessons to process.
        save_relationships (bool): Whether to save extracted relationships to JSON.
        relationship_list (List[Tuple[str, str, str]]): List of concept relationships.
        concept_list (List[str]): List of unique concepts extracted.
        prompts (Dict[str, str]): Dictionary of prompts for LLM tasks.
        verbose (bool): Whether to enable verbose logging.
        timestamp (str): Creation date (``YYYYMMDD``) used in output file names.
        G (Optional[nx.Graph]): Generated concept graph (``None`` until built).
        user_objectives (Dict[str, str]): User-defined lesson objectives.
        kwargs (dict): Extra keyword arguments given at construction (e.g. ``method``).

    Methods:
        load_and_process_lessons(threshold: float = 0.995):
            Loads lesson materials, summarizes content, and extracts relationships between concepts.
        build_concept_map(directed: bool = False, concept_similarity_threshold: float = 0.995,
                          dark_mode: bool = True, lesson_objectives: Optional[Dict[str, str]] = None):
            Runs the concept map generation pipeline and outputs visualizations.
    """

    def __init__(self, lesson_no: int, lesson_loader: LessonLoader, llm, course_name: str,
                 output_dir: Union[str, Path] = None, lesson_range: Union[range, int] = None,
                 lesson_objectives: Union[List[str], Dict[str, str]] = None,
                 verbose: bool = False, save_relationships: bool = False, **kwargs):
        """
        Initialize the ConceptMapBuilder with the configuration for concept map generation.

        Args:
            lesson_no (int): Current lesson number.
            lesson_loader (LessonLoader): Loader that provides the project directory and readings.
            llm (Any): Language model instance for text summarization and relationship extraction.
            course_name (str): Name of the course.
            output_dir (Union[str, Path], optional): Base output directory. When omitted, outputs go
                under ``<project_dir>/ClassFactoryOutput/ConceptWeb``. In both cases a lesson
                sub-folder (``L<first>_<last>`` or ``L<n>`` for a single lesson) is appended.
            lesson_range (Union[range, int], optional): Lesson numbers to process. An int is treated
                as a single lesson; when omitted, only ``lesson_no`` is processed.
            lesson_objectives (Union[List[str], Dict[str, str]], optional): User-defined lesson objectives.
            verbose (bool, optional): If True, enables verbose logging.
            save_relationships (bool, optional): If True, saves concepts and relationships as JSON.
            **kwargs: Optional overrides: ``summary_prompt``, ``relationship_prompt``, and ``method``
                (community detection method read by ``build_concept_map``).

        Raises:
            ValueError: If ``lesson_range`` is empty.
        """
        # Initialize BaseModel with shared attributes
        super().__init__(lesson_no=lesson_no, course_name=course_name, lesson_loader=lesson_loader,
                         output_dir=output_dir, verbose=verbose)

        self.llm = llm
        self.course_name = course_name

        # Normalize lesson_range: None -> current lesson only, int -> single-lesson range.
        if lesson_range is None:
            lesson_range = lesson_no
        if isinstance(lesson_range, int):
            lesson_range = range(lesson_range, lesson_range + 1)
        if len(lesson_range) == 0:
            raise ValueError("lesson_range must contain at least one lesson number.")
        self.lesson_range = lesson_range

        self.save_relationships = save_relationships
        self.relationship_list = []
        self.concept_list = []
        self.prompts = {'summary': kwargs.get('summary_prompt', summary_prompt),
                        'relationship': kwargs.get('relationship_prompt', relationship_prompt)}
        self.verbose = verbose
        self.timestamp = datetime.now().strftime("%Y%m%d")

        # Set output directory. The base directory and the lesson sub-folder are resolved
        # separately; combining them in one conditional expression previously mis-grouped
        # the operands and evaluated Path(None) for a single lesson with no output_dir.
        first_lesson, last_lesson = min(self.lesson_range), max(self.lesson_range)
        if first_lesson != last_lesson:
            lesson_folder = f"L{first_lesson}_{last_lesson}"
        else:
            lesson_folder = f"L{first_lesson}"
        if output_dir:
            base_dir = Path(output_dir)
        else:
            base_dir = Path(self.lesson_loader.project_dir) / "ClassFactoryOutput" / "ConceptWeb"
        self.output_dir = base_dir / lesson_folder
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Load user objectives and readings
        self.user_objectives = self.set_user_objectives(lesson_objectives, self.lesson_range) if lesson_objectives else {}
        self.G = None
        self.readings = self._load_readings(self.lesson_range)
        self.kwargs = kwargs

    def _summarize_document(self, document: str) -> str:
        """
        Summarize a single document using the LLM and the configured summary prompt.

        Args:
            document (str): Document content to summarize.

        Returns:
            str: Summarized content.
        """
        return summarize_text(document, prompt=self.prompts['summary'], course_name=self.course_name, llm=self.llm)

    def _extract_relationships(self, summary: str, objectives: str) -> List[Tuple[str, str, str]]:
        """
        Extract relationships between concepts from a summary and objectives.

        Args:
            summary (str): Summarized document content.
            objectives (str): Lesson objectives for context.

        Returns:
            List[Tuple[str, str, str]]: List of relationships as (concept1, relation, concept2) tuples.
        """
        # NOTE(review): self.prompts['relationship'] is stored in __init__ but not forwarded
        # here - confirm whether extract_relationships is meant to receive it.
        return extract_relationships(summary, objectives, self.course_name, llm=self.llm, verbose=self.verbose)

    def load_and_process_lessons(self, threshold: float = 0.995) -> None:
        """
        Process lesson materials by summarizing content and extracting concept relationships.

        For each lesson in `lesson_range`:
            - Look up the lesson objectives.
            - Summarize each reading using the LLM.
            - Extract relationships between concepts and collect the concepts they mention.

        Afterwards the accumulated relationships are passed through `process_relationships`
        and the concept list is de-duplicated.

        Args:
            threshold (float, optional): Similarity threshold handed to `process_relationships`.
                Defaults to 0.995.
        """
        self.logger.info(f"\nLoading lessons from {self.lesson_loader.reading_dir}...")

        # summarize readings
        for lesson, readings in self.readings.items():
            lesson_num = int(lesson)
            if lesson_num not in self.lesson_range:
                self.logger.info(f"Lesson {lesson_num} not provided lesson range. Skipping this reading. "
                                 "If this is an error, adjust provided lesson_range")
                continue

            lesson_objectives = self._get_lesson_objectives(lesson_num)

            for document in readings:
                summary = self._summarize_document(document)
                relationships = self._extract_relationships(summary, lesson_objectives)
                self.relationship_list.extend(relationships)

                concepts = extract_concepts_from_relationships(relationships)
                self.concept_list.extend(concepts)

        # Process relationships to normalize concepts
        self.logger.info("\nExtracting concepts and relations")
        self.relationship_list = process_relationships(self.relationship_list, threshold=threshold)
        self.concept_list = list(set(self.concept_list))  # Ensure unique concepts

    def _save_intermediate_data(self) -> None:
        """
        Save extracted concepts and relationships as JSON files in the output directory.
        (Triggered if `save_relationships`=True)

        Files saved:
            - `conceptlist_<timestamp>_Lsn_<lesson_range>.json`: List of unique concepts.
            - `relationship_list_<timestamp>_Lsn_<lesson_range>.json`: List of relationships.

        Raises:
            OSError: If saving files fails.
        """
        # <lesson_range> is the str() of the range object, e.g. "range(19, 21)".
        with open(self.output_dir / f'conceptlist_{self.timestamp}_Lsn_{self.lesson_range}.json', 'w') as f:
            json.dump(self.concept_list, f)

        with open(self.output_dir / f'relationship_list_{self.timestamp}_Lsn_{self.lesson_range}.json', 'w') as f:
            json.dump(self.relationship_list, f)

    def _build_and_visualize_graph(self, method: str = 'leiden', directed: bool = False, dark_mode: bool = True) -> None:
        """
        Construct and visualize a concept map graph, including community detection and word cloud generation.

        Steps:
            - Builds a concept graph from the processed relationships.
            - Detects communities with the specified `method` ('leiden', 'louvain', or 'spectral');
              with a single lesson, detection is skipped and every node gets community 0.
            - Writes an interactive HTML visualization and a word cloud PNG to `output_dir`.

        Args:
            method (str, optional): Community detection method. Defaults to 'leiden'.
            directed (bool, optional): If True, creates a directed graph. Defaults to False.
            dark_mode (bool): Sets graph to dark or white background. Defaults to True (dark mode).

        Raises:
            ValueError: If an unrecognized community detection method is used
                (only checked when more than one lesson is processed).
        """
        self.logger.info("\nBuilding graph...")
        self.G = build_graph(processed_relationships=self.relationship_list, directed=directed)

        # Skip community detection if there's only one lesson
        if len(self.lesson_range) <= 1:
            self.logger.info("\nSingle lesson detected. Skipping community detection.")
            # Assign all nodes to a single community
            for node in self.G.nodes:
                self.G.nodes[node]["community"] = 0
        else:
            self.logger.info("\nDetecting communities...")
            if method not in ['leiden', 'louvain', 'spectral']:
                raise ValueError("Community detection method not recognized. Please select from 'leiden', 'louvain', or 'spectral'.")
            self.G = detect_communities(self.G, method=method)

        output_html_path = self.output_dir / f"interactive_concept_map_{self.timestamp}_Lsn_{self.lesson_range}.html"
        visualize_graph_interactive(self.G, output_path=output_html_path, directed=directed, dark_mode=dark_mode)

        wordcloud_path = self.output_dir / f"concept_wordcloud_{self.timestamp}_Lsn_{self.lesson_range}.png"
        generate_wordcloud(self.concept_list, output_path=wordcloud_path)

    def build_concept_map(self, directed: bool = False, concept_similarity_threshold: float = 0.995,
                          dark_mode: bool = True, lesson_objectives: Optional[Dict[str, str]] = None) -> None:
        """
        Execute the full pipeline to generate a concept map.

        Runs `load_and_process_lessons`, optionally saves the intermediate JSON data, then
        builds and visualizes the graph. The community detection method is taken from the
        ``method`` keyword given at construction (default 'leiden').

        Args:
            directed (bool, optional): Whether to create a directed concept map. Defaults to False.
            concept_similarity_threshold (float, optional): Threshold for concept similarity. Defaults to 0.995.
            dark_mode (bool, optional): Whether to use dark mode for visualization. Defaults to True.
            lesson_objectives (Optional[Dict[str, str]], optional): User-provided lesson objectives.
                When given, they replace any objectives supplied to the constructor; when omitted,
                the constructor's objectives are kept. Defaults to None.

        Raises:
            ValueError: If the community detection method is not recognized
                (checked when more than one lesson is processed).
        """
        # Only override when objectives are actually supplied, so objectives passed to
        # __init__ are not silently discarded.
        if lesson_objectives:
            self.user_objectives = self.set_user_objectives(lesson_objectives, self.lesson_range)

        method = self.kwargs.get('method', 'leiden')

        self.load_and_process_lessons(threshold=concept_similarity_threshold)
        if self.save_relationships:
            self._save_intermediate_data()
        self._build_and_visualize_graph(method=method, directed=directed, dark_mode=dark_mode)
if __name__ == "__main__":
    # Manual example run: builds a concept map for a few lessons using paths and
    # credentials taken from the environment (optionally loaded from a .env file).
    import os

    from dotenv import load_dotenv
    from langchain_openai import ChatOpenAI
    # env setup
    from pyprojroot.here import here

    from class_factory.utils.tools import reset_loggers

    reset_loggers()
    load_dotenv()

    # Fail early with a readable message instead of a TypeError from `Path / None`.
    missing_vars = [name for name in ('readingsDir', 'syllabus_path') if not os.getenv(name)]
    if missing_vars:
        raise SystemExit(f"Missing required environment variable(s): {', '.join(missing_vars)}")

    user_home = Path.home()

    # Path definitions (environment values are joined onto the user's home directory)
    projectDir = here()
    readingDir = user_home / os.getenv('readingsDir')
    syllabus_path = user_home / os.getenv('syllabus_path')

    # Example usage
    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0.3,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        api_key=os.getenv('openai_key'),
        organization=os.getenv('openai_org'),
    )
    # To use a local model instead, import Ollama from langchain_community.llms and
    # build it with model="llama3.1", temperature=0.1.

    loader = LessonLoader(syllabus_path=syllabus_path,
                          reading_dir=readingDir,
                          project_dir=projectDir)

    # NOTE(review): range(19, 21) covers lessons 19 and 20 only, so lesson_no=21 is
    # outside the processed range - confirm this is intended.
    builder = ConceptMapBuilder(
        lesson_loader=loader,
        llm=llm,
        course_name="American Politics",
        lesson_no=21,
        lesson_range=range(19, 21),
        output_dir=None,
        verbose=False
    )

    builder.build_concept_map(directed=False, dark_mode=False)