Source code for class_factory.concept_web.ConceptWeb

"""
**ConceptWeb Module**
-----------------------

The `ConceptWeb` module provides tools to automatically extract, analyze, and visualize key concepts from lesson materials, helping to identify connections across topics and lessons. Central to this module is the `ConceptMapBuilder` class, which leverages a language model (LLM) to identify and structure important ideas and relationships from lesson readings and objectives into a graph-based representation.

Key functionalities of the module include:

- **Concept Extraction**:
    - Identifies key concepts from lesson readings and objectives using an LLM.
    - Summarizes and highlights main themes from each lesson's content.

- **Relationship Mapping**:
    - Extracts and maps relationships between identified concepts based on lesson objectives and content.
    - Facilitates understanding of how topics interrelate within and across lessons.

- **Graph-Based Visualization**:
    - Constructs a concept map in which nodes represent concepts and edges represent relationships.
    - Generates both interactive graph-based visualizations (HTML) and word clouds for key concepts.

- **Community Detection**:
    - Groups closely related concepts into thematic clusters.
    - Helps identify broader themes or subtopics within the lesson materials.

- **Data Saving**:
    - Optionally saves intermediate data (concepts and relationships) as JSON files for further review or analysis.

Dependencies
~~~~~~~~~~~~~

This module depends on:

- `langchain_core`: For LLM-based extraction and summarization tasks.
- `networkx`: For graph generation and analysis of concept relationships.
- `matplotlib` or `plotly`: For creating visualizations and word clouds.
- Custom utilities for loading documents, extracting objectives, and handling logging.

Usage Overview
~~~~~~~~~~~~~~

1. **Initialize ConceptMapBuilder**:
   - Instantiate `ConceptMapBuilder` with a `LessonLoader` (which holds the paths to the project directory, reading materials, and syllabus file), an LLM, and the course name.

2. **Generate the Concept Map**:
   - Use `build_concept_map()` to process lesson materials, extract and summarize concepts, map relationships, and generate visualizations.

3. **Save and Review**:
   - The generated concept map can be saved as an interactive HTML file or as a static word cloud for easier review and analysis.

Example
~~~~~~~~    

.. code-block:: python

    from pathlib import Path

    from class_factory.concept_web.ConceptWeb import ConceptMapBuilder
    from class_factory.utils.load_documents import LessonLoader
    from langchain_openai import ChatOpenAI

    # Set up paths and initialize components
    syllabus_path = Path("/path/to/syllabus.docx")
    reading_dir = Path("/path/to/lesson/readings")
    project_dir = Path("/path/to/project")
    llm = ChatOpenAI(api_key="your_api_key")

    # Initialize the lesson loader and concept map builder
    lesson_loader = LessonLoader(syllabus_path=syllabus_path, reading_dir=reading_dir, project_dir=project_dir)
    concept_map_builder = ConceptMapBuilder(
        lesson_no=1,
        lesson_loader=lesson_loader,
        llm=llm,
        course_name="Sample Course",
        lesson_range=range(1, 5)
    )

    # Build and visualize the concept map
    concept_map_builder.build_concept_map()


"""


import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

# parser setup
from langchain_core.output_parsers import JsonOutputParser

# self-made conceptweb functions
from class_factory.concept_web.build_concept_map import (build_graph,
                                                         detect_communities)
from class_factory.concept_web.concept_extraction import (
    extract_concepts_from_relationships, extract_relationships,
    process_relationships, summarize_text)
from class_factory.concept_web.prompts import (relationship_prompt,
                                               summary_prompt)
from class_factory.concept_web.visualize_graph import (
    generate_wordcloud, visualize_graph_interactive)
from class_factory.utils.base_model import BaseModel
from class_factory.utils.load_documents import LessonLoader
from class_factory.utils.response_parsers import Extracted_Relations
# general utils
from class_factory.utils.tools import logger_setup

# %%


class ConceptMapBuilder(BaseModel):
    """
    Generate concept maps (a form of knowledge graph) from lesson materials.

    A language model (LLM) is used to summarize each reading and to extract
    relationships between concepts. The relationships are then assembled into
    a graph, optionally partitioned into communities, and rendered as an
    interactive HTML page plus a word cloud image.

    Attributes:
        lesson_no (int): Current lesson number being processed.
        lesson_loader (LessonLoader): Loader instance for handling lesson materials.
        llm (Any): Language model instance for summarization and relationship extraction.
        course_name (str): Course name, used as context in LLM prompts.
        output_dir (Path): Directory for saving generated outputs.
        lesson_range (range): Range of lessons to process.
        save_relationships (bool): Whether to save extracted relationships to JSON.
        relationship_list (List[Tuple[str, str, str]]): List of concept relationships.
        concept_list (List[str]): List of unique concepts extracted.
        prompts (Dict[str, str]): Dictionary of prompts for LLM tasks.
        verbose (bool): Whether to enable verbose logging.
        timestamp (str): Creation date (``YYYYMMDD``) embedded in output file names.
        G (Optional[nx.Graph]): Generated concept graph (``None`` until built).
        user_objectives (Dict[str, str]): User-defined lesson objectives.
        readings: Lesson readings returned by ``_load_readings`` for ``lesson_range``.
        kwargs (dict): Extra keyword arguments (custom prompts, community ``method``).

    Methods:
        load_and_process_lessons(threshold: float = 0.995):
            Summarizes lesson readings and extracts relationships between concepts.
        build_concept_map(directed: bool = False, concept_similarity_threshold: float = 0.995,
                          dark_mode: bool = True, lesson_objectives: Optional[Dict[str, str]] = None):
            Runs the concept map generation pipeline and outputs visualizations.
    """

    def __init__(self, lesson_no: int, lesson_loader: LessonLoader, llm, course_name: str,
                 output_dir: Union[str, Path] = None, lesson_range: Union[range, int] = None,
                 lesson_objectives: Union[List[str], Dict[str, str]] = None,
                 verbose: bool = False, save_relationships: bool = False, **kwargs):
        """
        Initialize the ConceptMapBuilder with configuration for concept map generation.

        Args:
            lesson_no (int): Current lesson number.
            lesson_loader (LessonLoader): Loader used to access readings and project paths.
            llm (Any): Language model instance for text summarization and relationship extraction.
            course_name (str): Name of the course.
            output_dir (Union[str, Path], optional): Base output directory. When omitted,
                ``<project_dir>/ClassFactoryOutput/ConceptWeb`` is used. A lesson-specific
                subfolder (``L<first>_<last>`` or ``L<n>``) is always appended.
            lesson_range (Union[range, int], optional): Lesson numbers to process. An int is
                treated as a single lesson. Defaults to ``lesson_no`` only.
            lesson_objectives (Union[List[str], Dict[str, str]], optional): User-defined lesson objectives.
            verbose (bool, optional): If True, enables verbose logging.
            save_relationships (bool, optional): If True, saves concept relationships as JSON.
            **kwargs: Additional parameters. ``summary_prompt`` and ``relationship_prompt``
                override the default prompts; ``method`` selects the community detection method.
        """
        # Initialize BaseModel with shared attributes
        super().__init__(lesson_no=lesson_no, course_name=course_name, lesson_loader=lesson_loader,
                         output_dir=output_dir, verbose=verbose)

        # other setup
        self.llm = llm
        self.course_name = course_name
        # Without an explicit range, process only the current lesson instead of
        # failing later on min(None).
        if lesson_range is None:
            lesson_range = lesson_no
        self.lesson_range = range(lesson_range, lesson_range + 1) if isinstance(lesson_range, int) else lesson_range
        self.save_relationships = save_relationships
        self.relationship_list = []
        self.concept_list = []
        self.prompts = {'summary': kwargs.get('summary_prompt', summary_prompt),
                        'relationship': kwargs.get('relationship_prompt', relationship_prompt)}
        self.verbose = verbose
        self.timestamp = datetime.now().strftime("%Y%m%d")

        # set output directory
        first_lesson, last_lesson = min(self.lesson_range), max(self.lesson_range)
        # Choose the base directory and the lesson subfolder independently, so the
        # single-lesson case never evaluates Path(None) when output_dir is unset.
        if output_dir:
            base_dir = Path(output_dir)
        else:
            base_dir = Path(self.lesson_loader.project_dir) / "ClassFactoryOutput/ConceptWeb"
        subfolder = f"L{first_lesson}_{last_lesson}" if first_lesson != last_lesson else f"L{first_lesson}"
        self.output_dir = base_dir / subfolder
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # load user objectives and readings
        self.user_objectives = self.set_user_objectives(lesson_objectives, self.lesson_range) if lesson_objectives else {}
        self.G = None
        self.readings = self._load_readings(self.lesson_range)
        self.kwargs = kwargs

    def _summarize_document(self, document: str) -> str:
        """
        Summarizes a single document using the LLM.

        Args:
            document (str): Document content to summarize.

        Returns:
            str: Summarized content.
        """
        return summarize_text(document, prompt=self.prompts['summary'], course_name=self.course_name, llm=self.llm)

    def _extract_relationships(self, summary: str, objectives: str) -> List[Tuple[str, str, str]]:
        """
        Extracts relationships between concepts from a summary and objectives.

        Args:
            summary (str): Summarized document content.
            objectives (str): Lesson objectives for context.

        Returns:
            List[Tuple[str, str, str]]: List of relationships as (concept1, relation, concept2) tuples.
        """
        # NOTE(review): self.prompts['relationship'] is stored but not forwarded here;
        # confirm whether extract_relationships accepts a custom prompt argument.
        return extract_relationships(summary, objectives, self.course_name, llm=self.llm, verbose=self.verbose)

    def load_and_process_lessons(self, threshold: float = 0.995):
        """
        Process lesson materials by summarizing content and extracting concept relationships.

        For each lesson in `lesson_range`:
            - Look up the lesson objectives.
            - Summarize each reading using the LLM.
            - Extract relationships between concepts and collect the concepts they mention.

        Afterwards the accumulated relationships are passed through
        `process_relationships` and the concept list is de-duplicated.

        Args:
            threshold (float, optional): Similarity threshold for extracted concepts. Defaults to 0.995.
        """
        self.logger.info(f"\nLoading lessons from {self.lesson_loader.reading_dir}...")

        # summarize readings
        for lesson, readings in self.readings.items():
            lesson_num = int(lesson)
            if lesson_num not in self.lesson_range:
                self.logger.info(f"Lesson {lesson_num} not provided lesson range. Skipping this reading. "
                                 "If this is an error, adjust provided lesson_range")
                continue

            lesson_objectives = self._get_lesson_objectives(lesson_num)

            for document in readings:
                summary = self._summarize_document(document)
                relationships = self._extract_relationships(summary, lesson_objectives)
                self.relationship_list.extend(relationships)

                concepts = extract_concepts_from_relationships(relationships)
                self.concept_list.extend(concepts)

        # Process relationships to normalize concepts
        self.logger.info("\nExtracting concepts and relations")
        self.relationship_list = process_relationships(self.relationship_list, threshold=threshold)
        self.concept_list = list(set(self.concept_list))  # Ensure unique concepts

    def _save_intermediate_data(self):
        """
        Saves extracted concepts and relationships as JSON files in the output directory.
        (Triggered if `save_relationships`=True)

        Files saved:
            - `conceptlist_<timestamp>_Lsn_<lesson_range>.json`: List of unique concepts.
            - `relationship_list_<timestamp>_Lsn_<lesson_range>.json`: List of relationships.

        Raises:
            OSError: If saving files fails.
        """
        concept_path = self.output_dir / f'conceptlist_{self.timestamp}_Lsn_{self.lesson_range}.json'
        relationship_path = self.output_dir / f'relationship_list_{self.timestamp}_Lsn_{self.lesson_range}.json'

        # Explicit encoding keeps the output independent of the platform default.
        with open(concept_path, 'w', encoding='utf-8') as f:
            json.dump(self.concept_list, f)

        with open(relationship_path, 'w', encoding='utf-8') as f:
            json.dump(self.relationship_list, f)

    def _build_and_visualize_graph(self, method: str = 'leiden', directed: bool = False, dark_mode: bool = True):
        """
        Construct and visualize a concept map graph, including community detection and word cloud generation.

        Steps:
            - Builds a concept graph where nodes represent concepts and edges represent relationships.
            - Detects communities based on the specified `method` ('leiden', 'louvain', or 'spectral').
              With a single lesson, detection is skipped and every node is placed in community 0.
            - Generates an HTML visualization and word cloud.

        Args:
            method (str, optional): Community detection method. Defaults to 'leiden'.
            directed (bool, optional): If True, creates a directed graph. Defaults to False.
            dark_mode (bool): Sets graph to dark or white background. Defaults to True (dark mode).

        Raises:
            ValueError: If an unrecognized community detection method is used
                (only checked when more than one lesson is processed).
        """
        self.logger.info("\nBuilding graph...")
        self.G = build_graph(processed_relationships=self.relationship_list, directed=directed)

        # Skip community detection if there's only one lesson
        if len(self.lesson_range) <= 1:
            self.logger.info("\nSingle lesson detected. Skipping community detection.")
            # Assign all nodes to a single community
            for node in self.G.nodes:
                self.G.nodes[node]["community"] = 0
        else:
            self.logger.info("\nDetecting communities...")
            if method not in ['leiden', 'louvain', 'spectral']:
                raise ValueError("Community detection method not recognized. Please select from 'leiden', 'louvain', or 'spectral'.")
            self.G = detect_communities(self.G, method=method)

        output_html_path = self.output_dir / f"interactive_concept_map_{self.timestamp}_Lsn_{self.lesson_range}.html"
        visualize_graph_interactive(self.G, output_path=output_html_path, directed=directed, dark_mode=dark_mode)

        wordcloud_path = self.output_dir / f"concept_wordcloud_{self.timestamp}_Lsn_{self.lesson_range}.png"
        generate_wordcloud(self.concept_list, output_path=wordcloud_path)

    def build_concept_map(self, directed: bool = False, concept_similarity_threshold: float = 0.995,
                          dark_mode: bool = True, lesson_objectives: Optional[Dict[str, str]] = None) -> None:
        """
        Execute the full pipeline to generate a concept map.

        Args:
            directed (bool, optional): Whether to create a directed concept map. Defaults to False.
            concept_similarity_threshold (float, optional): Threshold for concept similarity. Defaults to 0.995.
            dark_mode (bool, optional): Whether to use dark mode for visualization. Defaults to True.
            lesson_objectives (Optional[Dict[str, str]], optional): User-provided lesson objectives.
                When given, they replace any objectives supplied at construction time;
                when omitted, the existing objectives are kept. Defaults to None.

        Raises:
            ValueError: If any process encounters invalid data.
        """
        # Only override objectives when new ones are supplied, so objectives passed
        # to the constructor are not silently discarded.
        if lesson_objectives:
            self.user_objectives = self.set_user_objectives(lesson_objectives, self.lesson_range)

        method = self.kwargs.get('method', 'leiden')
        self.load_and_process_lessons(threshold=concept_similarity_threshold)
        if self.save_relationships:
            self._save_intermediate_data()
        self._build_and_visualize_graph(method=method, directed=directed, dark_mode=dark_mode)

if __name__ == "__main__":
    # Manual smoke test: builds a concept map for a sample course using paths
    # and API credentials read from the environment (.env file supported).
    import os
    from pathlib import Path

    from dotenv import load_dotenv
    from langchain_openai import ChatOpenAI
    # env setup
    from pyprojroot.here import here

    from class_factory.utils.tools import reset_loggers

    reset_loggers()
    load_dotenv()

    # Fail with a clear message instead of a TypeError from `Path / None`.
    missing_vars = [var for var in ('readingsDir', 'syllabus_path') if not os.getenv(var)]
    if missing_vars:
        raise SystemExit(f"Missing required environment variable(s): {', '.join(missing_vars)}")

    user_home = Path.home()

    # Path definitions
    projectDir = here()
    readingDir = user_home / os.getenv('readingsDir')
    syllabus_path = user_home / os.getenv('syllabus_path')
    # pdf_syllabus_path = user_home / os.getenv('pdf_syllabus_path')

    # Example usage
    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0.3,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        api_key=os.getenv('openai_key'),
        organization=os.getenv('openai_org'),
    )

    # Local alternative (requires langchain_community):
    # from langchain_community.llms import Ollama
    # llm = Ollama(
    #     model="llama3.1",
    #     temperature=0.1,
    # )

    loader = LessonLoader(syllabus_path=syllabus_path, reading_dir=readingDir, project_dir=projectDir)

    builder = ConceptMapBuilder(
        lesson_loader=loader,
        llm=llm,
        course_name="American Politics",
        lesson_no=21,
        lesson_range=range(19, 21),
        output_dir=None,
        verbose=False
    )

    builder.build_concept_map(directed=False, dark_mode=False)