# Source code for class_factory.concept_web.ConceptWeb

"""
**ConceptWeb Module**
-----------------------

The `ConceptWeb` module provides tools to automatically extract, analyze, and visualize key concepts from lesson materials, helping to identify connections across topics and lessons. Central to this module is the `ConceptMapBuilder` class, which leverages a language model (LLM) to identify and structure important ideas and relationships from lesson readings and objectives into a graph-based representation.

Key functionalities of the module include:

- **Concept Extraction**:
    - Identifies key concepts from lesson readings and objectives using an LLM.
    - Summarizes and highlights main themes from each lesson's content.

- **Relationship Mapping**:
    - Extracts and maps relationships between identified concepts based on lesson objectives and content.
    - Facilitates understanding of how topics interrelate within and across lessons.

- **Graph-Based Visualization**:
    - Constructs a concept map in which nodes represent concepts and edges represent relationships.
    - Generates both interactive graph-based visualizations (HTML) and word clouds for key concepts.

- **Community Detection**:
    - Groups closely related concepts into thematic clusters.
    - Helps identify broader themes or subtopics within the lesson materials.

- **Data Saving**:
    - Optionally saves intermediate data (concepts and relationships) as JSON files for further review or analysis.

Dependencies
~~~~~~~~~~~~~

This module depends on:

- `langchain_core`: For LLM-based extraction and summarization tasks.
- `networkx`: For graph generation and analysis of concept relationships.
- `matplotlib` or `plotly`: For creating visualizations and word clouds.
- Custom utilities for loading documents, extracting objectives, and handling logging.

Usage Overview
~~~~~~~~~~~~~~

1. **Initialize ConceptMapBuilder**:
   - Instantiate `ConceptMapBuilder` with paths to project directories, reading materials, and the syllabus file.

2. **Generate the Concept Map**:
   - Use `build_concept_map()` to process lesson materials, extract and summarize concepts, map relationships, and generate visualizations.

3. **Save and Review**:
   - The generated concept map can be saved as an interactive HTML file or as a static word cloud for easier review and analysis.

Example
~~~~~~~~

.. code-block:: python

    from pathlib import Path

    from class_factory.concept_web.ConceptWeb import ConceptMapBuilder
    from class_factory.utils.load_documents import LessonLoader
    from langchain_openai import ChatOpenAI

    # Set up paths and initialize components
    syllabus_path = Path("/path/to/syllabus.docx")
    reading_dir = Path("/path/to/lesson/readings")
    project_dir = Path("/path/to/project")
    llm = ChatOpenAI(api_key="your_api_key")

    # Initialize the lesson loader and concept map builder
    lesson_loader = LessonLoader(syllabus_path=syllabus_path, reading_dir=reading_dir, project_dir=project_dir)
    concept_map_builder = ConceptMapBuilder(
        lesson_no=1,
        lesson_loader=lesson_loader,
        llm=llm,
        course_name="Sample Course",
        lesson_range=range(1, 5)
    )

    # Build and visualize the concept map
    concept_map_builder.build_concept_map()


"""

# %%
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

# parser setup
from langchain_core.output_parsers import JsonOutputParser
from tqdm import tqdm

# self-made conceptweb functions
from class_factory.concept_web.build_concept_map import (build_graph,
                                                         detect_communities)
from class_factory.concept_web.concept_extraction import (
    extract_concepts_from_relationships, extract_relationships,
    process_relationships, summarize_text)
from class_factory.concept_web.prompts import (relationship_prompt,
                                               summary_prompt)
from class_factory.concept_web.visualize_graph import \
    visualize_graph_interactive
from class_factory.utils.base_model import BaseModel
from class_factory.utils.load_documents import LessonLoader

# %%


class ConceptMapBuilder(BaseModel):
    """
    Orchestrate extraction, analysis, and visualization of lesson concepts.

    For every reading in the configured lesson range the builder asks the
    language model (LLM) for a summary, extracts ``(concept, relation, concept)``
    relationships from that summary, and finally assembles the relationships
    into a graph that is rendered as an interactive HTML concept map.

    Attributes set by this class:
        llm: Language model instance used for summarization and extraction.
        lesson_range (range): Lessons to process.
        relationship_list (list): Relationships accumulated across lessons.
        concept_list (list): Concepts accumulated across lessons.
        prompts (dict): ``'summary'`` and ``'relationship'`` prompt templates.
        output_dir (Path): Directory that receives generated artifacts.
        G: The concept graph; ``None`` until ``_build_graph`` has run.

    NOTE(review): ``self.logger``, ``self.lesson_loader``,
    ``set_user_objectives``, ``_load_readings`` and ``_get_lesson_objectives``
    are not defined here and are presumably provided by ``BaseModel`` -
    confirm against that class.
    """

    # Community detection algorithms accepted by ``_build_graph``.
    _COMMUNITY_METHODS = ('leiden', 'louvain', 'spectral')

    def __init__(self, lesson_no: int, lesson_loader: LessonLoader, llm, course_name: str,
                 output_dir: Optional[Union[str, Path]] = None,
                 lesson_range: Optional[Union[range, int]] = None,
                 lesson_objectives: Optional[Union[List[str], Dict[str, str]]] = None,
                 verbose: bool = False, save_relationships: bool = False, **kwargs):
        """
        Initialize the ConceptMapBuilder with configuration for concept map generation.

        Args:
            lesson_no (int): Current lesson number.
            lesson_loader (LessonLoader): Loader for lesson materials.
            llm (Any): Language model instance.
            course_name (str): Name of the course.
            output_dir (Union[str, Path], optional): Base output directory. When omitted,
                ``<project_dir>/ClassFactoryOutput/ConceptWeb`` is used. A lesson-specific
                subfolder (``L<first>_<last>`` or ``L<n>``) is always appended.
            lesson_range (Union[range, int], optional): Lessons to process. An int selects
                that single lesson. Defaults to ``lesson_no`` only.
            lesson_objectives (Union[List[str], Dict[str, str]], optional): User-defined
                lesson objectives.
            verbose (bool, optional): Enable verbose logging. Defaults to False.
            save_relationships (bool, optional): Save relationships as JSON. Defaults to False.
            **kwargs: Additional options. ``summary_prompt`` and ``relationship_prompt``
                override the default prompts; ``method`` selects the community detection
                algorithm used by ``build_concept_map``.

        Raises:
            ValueError: If ``lesson_range`` contains no lessons.
        """
        # Initialize BaseModel with shared attributes
        super().__init__(lesson_no=lesson_no, course_name=course_name, lesson_loader=lesson_loader,
                         output_dir=output_dir, verbose=verbose)

        # other setup
        self.llm = llm
        self.course_name = course_name

        # Without an explicit range, fall back to the current lesson instead of
        # failing later on ``min(None)``.
        if lesson_range is None:
            lesson_range = lesson_no
        self.lesson_range = range(lesson_range, lesson_range + 1) if isinstance(lesson_range, int) else lesson_range
        if len(self.lesson_range) == 0:
            raise ValueError("lesson_range must contain at least one lesson")

        self.save_relationships = save_relationships
        self.relationship_list = []
        self.concept_list = []
        self.prompts = {'summary': kwargs.get('summary_prompt', summary_prompt),
                        'relationship': kwargs.get('relationship_prompt', relationship_prompt)}
        self.verbose = verbose
        self.timestamp = datetime.now().strftime("%Y%m%d")

        # set output directory: pick the base first, then append the lesson subfolder.
        # (Previously the single-lesson case without output_dir evaluated Path(None).)
        first_lesson, last_lesson = min(self.lesson_range), max(self.lesson_range)
        if first_lesson != last_lesson:
            lesson_folder = f"L{first_lesson}_{last_lesson}"
        else:
            lesson_folder = f"L{first_lesson}"
        if output_dir:
            base_dir = Path(output_dir)
        else:
            base_dir = Path(self.lesson_loader.project_dir) / "ClassFactoryOutput/ConceptWeb"
        self.output_dir = base_dir / lesson_folder
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # load user objectives and readings
        self.user_objectives = self.set_user_objectives(lesson_objectives, self.lesson_range) if lesson_objectives else {}
        self.G = None
        self.readings = self._load_readings(self.lesson_range)
        self.kwargs = kwargs

    def _summarize_document(self, document: str) -> str:
        """
        Summarize a single document using the LLM and summary prompt.

        Args:
            document (str): Document content to summarize.

        Returns:
            str: Summarized content.
        """
        return summarize_text(document, prompt=self.prompts['summary'], course_name=self.course_name, llm=self.llm)

    def _extract_relationships(self, summary: str, objectives: str) -> List[Tuple[str, str, str]]:
        """
        Extract relationships between concepts from a summary and objectives using the LLM.

        Args:
            summary (str): Summarized document content.
            objectives (str): Lesson objectives for context.

        Returns:
            List[Tuple[str, str, str]]: List of (concept1, relation, concept2) tuples.
        """
        # NOTE(review): self.prompts['relationship'] is stored but not forwarded here,
        # so a custom ``relationship_prompt`` appears to have no effect - confirm
        # against the signature of ``extract_relationships``.
        return extract_relationships(summary, objectives, self.course_name, llm=self.llm, verbose=self.verbose)

    def load_and_process_lessons(self, threshold: float = 0.995):
        """
        Summarize each lesson's readings and extract concept relationships from them.

        For each lesson in ``self.readings`` that falls inside ``lesson_range``:

        - Fetch the lesson objectives.
        - Summarize every reading using the LLM.
        - Extract relationships between concepts and collect the concepts they mention.

        Results accumulate in ``relationship_list`` and ``concept_list``; readings and
        their summaries are stored in ``readings_with_summaries`` keyed by lesson.
        Afterwards relationships are passed through ``process_relationships`` and the
        concept list is de-duplicated (first-seen order is kept).

        Args:
            threshold (float, optional): Similarity threshold forwarded to
                ``process_relationships``. Defaults to 0.995.
        """
        total_lessons = len(self.readings)
        total_documents = sum(len(readings) for readings in self.readings.values())

        self.logger.info(f"\n=== Starting Concept Extraction from {self.lesson_loader.reading_dir} ===\n")
        self.logger.info(f"Processing {total_lessons} lesson(s) with {total_documents} total document(s)\n")

        # Initialize a new structure to hold readings and summaries
        self.readings_with_summaries = {}

        # summarize readings with progress bar
        lesson_iter = tqdm(self.readings.items(), desc="Processing lessons", unit="lesson")
        for lesson_idx, (lesson, readings) in enumerate(lesson_iter, 1):
            lesson_num = int(lesson)
            if lesson_num not in self.lesson_range:
                self.logger.info(f"Lesson {lesson_num} not in provided lesson range. Skipping this reading. "
                                 "If this is an error, adjust provided lesson_range")
                continue

            self.logger.info(f"[{lesson_idx}/{total_lessons}] Processing Lesson {lesson_num} ({len(readings)} document(s))")
            lesson_objectives = self._get_lesson_objectives(lesson_num)

            # Collect the summaries produced for this lesson
            summaries = []
            for document in readings:
                summary = self._summarize_document(document)
                summaries.append(summary)

                relationships = self._extract_relationships(summary, lesson_objectives)
                concepts = extract_concepts_from_relationships(relationships)

                self.relationship_list.extend(relationships)
                self.concept_list.extend(concepts)

            # Store both readings and summaries in the new structure
            self.readings_with_summaries[lesson] = {
                'readings': readings,
                'summaries': summaries
            }

        # Process relationships to normalize concepts
        initial_relationships = len(self.relationship_list)
        initial_concepts = len(self.concept_list)
        self.logger.info("\n=== Processing and Normalizing Extracted Data ===\n")
        self.logger.info(f"Initial extraction: {initial_relationships} relationships, {initial_concepts} concepts")

        self.relationship_list = process_relationships(self.relationship_list, threshold=threshold)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the saved
        # concept list is reproducible between runs (a plain set is not).
        self.concept_list = list(dict.fromkeys(self.concept_list))

        final_relationships = len(self.relationship_list)
        final_concepts = len(self.concept_list)
        self.logger.info(f"After processing: {final_relationships} relationships, {final_concepts} unique concepts")
        self.logger.info("=== Concept Extraction Complete ===\n")

    def _save_intermediate_data(self):
        """
        Save extracted concepts and relationships as JSON files in the output directory.

        Called by ``build_concept_map`` when ``save_relationships`` is True.

        Files saved:
            - `conceptlist_<timestamp>_Lsn_<lesson_range>.json`: List of unique concepts.
            - `relationship_list_<timestamp>_Lsn_<lesson_range>.json`: List of relationships.

        Raises:
            OSError: If saving files fails.
        """
        # NOTE(review): <lesson_range> is the repr of the range object (e.g.
        # "range(1, 11)"); kept unchanged so existing output file names stay stable.
        concept_path = self.output_dir / f'conceptlist_{self.timestamp}_Lsn_{self.lesson_range}.json'
        relationship_path = self.output_dir / f'relationship_list_{self.timestamp}_Lsn_{self.lesson_range}.json'

        with open(concept_path, 'w', encoding='utf-8') as f:
            json.dump(self.concept_list, f)

        with open(relationship_path, 'w', encoding='utf-8') as f:
            json.dump(self.relationship_list, f)

    def _build_graph(self, method: str = 'leiden', directed: bool = False):
        """
        Build the concept graph from ``relationship_list`` and assign communities.

        With a single lesson in ``lesson_range`` community detection is skipped and
        every node is placed in community 0.

        Args:
            method (str, optional): Community detection method ('leiden', 'louvain', 'spectral').
                Defaults to 'leiden'. Only validated when detection actually runs.
            directed (bool, optional): If True, creates a directed graph. Defaults to False.

        Raises:
            ValueError: If an unrecognized community detection method is used.
        """
        self.logger.info("\nBuilding graph...")
        self.G = build_graph(processed_relationships=self.relationship_list, directed=directed)

        # Skip community detection if there's only one lesson
        if len(self.lesson_range) <= 1:
            self.logger.info("\nSingle lesson detected. Skipping community detection.")
            for node in self.G.nodes:
                self.G.nodes[node]["community"] = 0  # Assign all nodes to community 0
            return

        self.logger.info("\nDetecting communities...")
        if method not in self._COMMUNITY_METHODS:
            raise ValueError("Community detection method not recognized. "
                             "Please select from 'leiden', 'louvain', or 'spectral'.")
        self.G = detect_communities(self.G, method=method)

    def _visualize_graph(self, directed: bool = False, dark_mode: bool = True, max_nodes: int = 250,
                         centrality_method: str = "degree", expand_neighbors: bool = True):
        """
        Render ``self.G`` as an interactive HTML concept map in the output directory.

        The file is named ``interactive_concept_map_<timestamp>_Lsn_<lesson_range>.html``.
        All arguments are forwarded unchanged to ``visualize_graph_interactive``.

        Args:
            directed (bool, optional): Forwarded as ``directed``. Defaults to False.
            dark_mode (bool, optional): Forwarded as ``dark_mode``. Defaults to True.
            max_nodes (int, optional): Forwarded as ``max_nodes``. Defaults to 250.
            centrality_method (str, optional): Forwarded as ``centrality_method``.
                Defaults to "degree".
            expand_neighbors (bool, optional): Forwarded as ``expand_neighbors``.
                Defaults to True.
        """
        output_html_path = self.output_dir / f"interactive_concept_map_{self.timestamp}_Lsn_{self.lesson_range}.html"
        visualize_graph_interactive(
            self.G,
            output_path=output_html_path,
            directed=directed,
            dark_mode=dark_mode,
            max_nodes=max_nodes,
            centrality_method=centrality_method,
            expand_neighbors=expand_neighbors
        )

    def build_concept_map(self, directed: bool = False, concept_similarity_threshold: float = 0.995,
                          dark_mode: bool = True, lesson_objectives: Optional[Dict[str, str]] = None) -> None:
        """
        Run the full pipeline to generate a concept map and visualization.

        Steps: process lessons, optionally save intermediate JSON, build the graph
        (with community detection), and write the interactive HTML visualization.
        The community detection method is read from the ``method`` keyword passed to
        the constructor (default 'leiden').

        Args:
            directed (bool, optional): Whether to create a directed concept map. Defaults to False.
            concept_similarity_threshold (float, optional): Threshold for concept similarity.
                Defaults to 0.995.
            dark_mode (bool, optional): Use dark mode for visualization. Defaults to True.
            lesson_objectives (Optional[Dict[str, str]], optional): User-provided lesson
                objectives. When given they replace the objectives supplied at
                construction; when omitted the existing objectives are kept.
        """
        # Only override when new objectives are supplied; resetting to {} here would
        # silently drop objectives passed to __init__.
        if lesson_objectives:
            self.user_objectives = self.set_user_objectives(lesson_objectives, self.lesson_range)

        method = self.kwargs.get('method', 'leiden')

        self.load_and_process_lessons(threshold=concept_similarity_threshold)
        if self.save_relationships:
            self._save_intermediate_data()
        self._build_graph(method=method, directed=directed)
        self._visualize_graph(directed=directed, dark_mode=dark_mode)


if __name__ == "__main__":
    # Manual smoke run: builds a concept map for lessons 1-10 of the course
    # configured under the 'PS211' key of class_config.yaml.
    import os

    import yaml
    from dotenv import load_dotenv
    from langchain_openai import ChatOpenAI
    from pyprojroot.here import here

    from class_factory.utils.tools import reset_loggers

    # env setup
    reset_loggers()
    load_dotenv()

    user_home = Path.home()

    # Path definitions (config paths are joined onto the user's home directory)
    with open("class_config.yaml", "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    class_config = config['PS211']
    syllabus_path = user_home / class_config['syllabus_path']
    reading_dir = user_home / class_config['reading_dir']
    project_dir = here()

    # Example usage. Any other chat model object can be passed as ``llm`` instead.
    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0.3,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        api_key=os.getenv('openai_key'),
        organization=os.getenv('openai_org'),
    )

    loader = LessonLoader(syllabus_path=syllabus_path,
                          reading_dir=reading_dir,
                          project_dir=project_dir)

    builder = ConceptMapBuilder(
        lesson_loader=loader,
        llm=llm,
        course_name="American Politics",
        lesson_no=10,
        lesson_range=range(1, 11),
        output_dir=None,
        verbose=False,
    )

    builder.build_concept_map(directed=True, dark_mode=False)

# %%