"""
Document Loading and Processing Module
-------------------------------------
This module provides functionality to load, process, and extract text from various document types
(PDF, DOCX, and TXT) for generating lesson-specific content. It includes support for OCR processing
of scanned documents and handling of structured educational materials like syllabi and lesson readings.
Classes
~~~~~~~
LessonLoader
Main class for handling document loading and processing operations.
Key Functions
~~~~~~~
The LessonLoader class provides these key functionalities:
- Document Loading:
- load_directory: Load all documents from a specified directory
- load_lessons: Load lessons from multiple directories with lesson number inference
- load_readings: Extract text from individual documents
- load_beamer_presentation: Load Beamer presentation content
- Syllabus Processing:
- extract_lesson_objectives: Extract objectives for specific lessons
- load_docx_syllabus: Load and parse DOCX syllabus content
- find_docx_indices: Locate lesson sections within syllabus
- Text Extraction:
- extract_text_from_pdf: Extract text from PDF files
- ocr_pdf: Perform OCR on scanned documents
- convert_pdf_to_docx: Convert PDF files to DOCX format
Dependencies
~~~~~~~~~~~
Core Dependencies:
- pypdf: PDF text extraction
- python-docx: DOCX file handling
- pathlib: File path operations
- typing: Type hints
Optional OCR Dependencies:
- pytesseract: OCR processing
- pdf2image: PDF to image conversion
- spacy: Text processing
- contextualSpellCheck: Text correction
- img2table: Table extraction from images
Example Usage
~~~~~~~~~~~~~
.. code-block:: python
from class_factory.utils.load_documents import LessonLoader
# Initialize loader with paths
loader = LessonLoader(
syllabus_path="path/to/syllabus.docx",
reading_dir="path/to/readings"
)
# Load specific lesson content
lesson_content = loader.load_lessons(lesson_number=5)
# Extract lesson objectives
objectives = loader.extract_lesson_objectives(current_lesson=5)
Notes
~~~~~
- OCR functionality requires additional package installation via `pip install class_factory[ocr]`
- Directory structure should follow consistent naming (e.g., 'L1', 'L2', etc.)
- Supports both PDF and DOCX syllabus formats with automatic conversion if needed
See Also
~~~~~~~~
- :class:`class_factory.utils.tools.logger_setup`: Logger configuration
- :mod:`class_factory.utils.base_model`: Base model implementation
"""
# %%
import logging
import re
import time
from pathlib import Path
from typing import Dict, List, Tuple, Union
import pypdf
from docx import Document
# doc import
from docx.opc.exceptions import PackageNotFoundError
from markitdown import (FileConversionException, MarkItDown,
UnsupportedFormatException)
from pdf2docx import Converter
from pyprojroot.here import here
from class_factory.utils.tools import logger_setup
try:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
except ImportError:
pytesseract = None
Image = None
convert_from_path = None
spacy = None
contextualSpellCheck = None
img2table = None
[docs]
class LessonLoader:
"""
A class for loading and managing educational content from various document formats.
This class handles loading and processing of lesson materials including syllabi, readings,
and Beamer presentations. It supports multiple file formats (PDF, DOCX, TXT) and provides
OCR capabilities for scanned documents when necessary.
Attributes:
reading_dir (Path): Directory containing lesson reading materials
slide_dir (Path): Directory containing Beamer presentation slides
project_dir (Path): Root project directory
syllabus_path (Path): Path to the course syllabus file
logger (Logger): Class logger instance
Args:
syllabus_path (Union[Path, str]): Path to the syllabus file
reading_dir (Union[Path, str]): Directory containing lesson readings
slide_dir (Union[Path, str], optional): Directory for Beamer slides. Defaults to None.
project_dir (Union[Path, str], optional): Root directory for the project. Defaults to None.
verbose (bool, optional): Whether to show detailed logging. Defaults to True.
"""
def __init__(self, syllabus_path: Union[Path, str], reading_dir: Union[Path, str],
slide_dir: Union[Path, str, None] = None, project_dir: Union[Path, str, None] = None,
verbose: bool = True, tabular_syllabus: bool = False):
"""
Initializes LessonLoader with paths and performs validation.
Args:
syllabus_path (Union[Path, str]): Path to the syllabus file.
reading_dir (Union[Path, str]): Directory for lesson readings.
slide_dir (Union[Path, str]): Directory for Beamer slides, Default is None.
project_dir (Union[Path, str]): Root directory for the project, Default is None. If not provided, returns pyprojroot.here.here()
log_level (int): Logging level for the LessonLoader instance.
"""
log_level = logging.INFO if verbose else logging.WARNING
self.logger = logger_setup(logger_name="lesson_loader_logger", log_level=log_level)
self.reading_dir = self._validate_dir_path(reading_dir, "reading directory")
self.slide_dir = self._validate_dir_path(slide_dir, "slide directory") if slide_dir else None
self.project_dir = self._validate_dir_path(project_dir, "root project directory") if project_dir else here()
self.syllabus_path = self._validate_file_path(syllabus_path, "syllabus file") if syllabus_path else None
if not syllabus_path:
self.logger.warning("No syllabus path provided. Generated content may not be as accurate or comprehensive without a syllabus.")
self.tabular_syllabus = tabular_syllabus
@property
def slide_dir(self):
return self._slide_dir
@slide_dir.setter
def slide_dir(self, value):
"""Update slide_dir and validate if a path is provided."""
if value is not None:
self._slide_dir = self._validate_dir_path(value, "slide directory")
self.logger.info(f"Slide directory updated to {self._slide_dir}")
else:
self._slide_dir = None # Clear if None is passed
def _validate_directory_structure(self):
"""
Validate the directory structure for lesson readings, ensuring that each directory
follows the expected pattern and contains supported file types.
Expected Directory Naming:
Each directory should correspond to a lesson and follow a naming pattern such as:
- "L1", "Lesson_1", "Lecture 1", "W1", "Week_1", etc.
- The pattern allows optional spaces or underscores between the lesson identifier (e.g., "L", "Lesson", etc.)
and the lesson number.
Supported File Types:
Each lesson directory should contain at least one reading file of type: .pdf, .txt, or .docx.
Raises:
ValueError: If the structure does not match the expected layout, listing any issues found.
"""
# Updated regex pattern to allow for flexible directory names with optional spaces or underscores
expected_pattern = r'^(L|Lesson|Lecture|W|Week)[\s_-]*\d+$'
issues = []
for subdir in self.reading_dir.iterdir():
if subdir.is_dir():
if not re.match(expected_pattern, subdir.name, re.IGNORECASE):
issues.append(f"Directory '{subdir.name}' does not match expected pattern '{expected_pattern}'. "
"Recommend consistent naming convention by lesson (e.g., 'L1', 'L2', etc.).")
# Check for supported file formats within each lesson directory
if not any(file.suffix in ['.pdf', '.txt', '.docx'] for file in subdir.iterdir() if file.is_file()):
issues.append(f"Directory '{subdir.name}' does not contain any supported reading files (.pdf, .txt, .docx).")
if issues:
self.logger.warning("Directory structure validation failed with the following issues:\n" + "\n".join(issues) +
"\nContinue if desired or this is a known issue in the provided directory.")
# raise ValueError("Directory structure validation failed with the following issues:\n" + "\n".join(issues))
@staticmethod
def _validate_file_path(path: Union[Path, str], name: str) -> Path:
"""
Validates that the given path is an existing file.
"""
path = Path(path)
if not path.is_file():
raise FileNotFoundError(f"The {name} at path '{path}' does not exist or is not a file.")
return path
@staticmethod
def _validate_dir_path(path: Union[Path, str], name: str, create_if_missing: bool = False) -> Path:
"""
Validates that the given path is an existing directory. Optionally creates it if missing.
Args:
path (Union[Path, str]): The directory path to validate.
name (str): Directory name for error messages.
create_if_missing (bool): If True, creates the directory if it doesn't exist.
"""
path = Path(path)
if not path.exists() and create_if_missing:
path.mkdir(parents=True)
elif not path.is_dir():
raise NotADirectoryError(f"The {name} at path '{path}' does not exist or is not a directory.")
return path
[docs]
@staticmethod
def ocr_available():
"""Validate if current packages installed support OCR"""
return all([pytesseract, Image, convert_from_path, spacy, contextualSpellCheck, img2table])
[docs]
@staticmethod
def missing_ocr_packages():
packages = {
"pytesseract": pytesseract,
"pillow": Image,
"pdf2image": convert_from_path,
"spacy": spacy,
"contextualSpellCheck": contextualSpellCheck,
"img2table": img2table
}
return [pkg_name for pkg_name, module in packages.items() if module is None]
[docs]
def load_readings(self, file_path: Union[str, Path]) -> str:
"""
Load text content from a single document file.
Args:
file_path (Union[str, Path]): Path to the document file to load
Returns:
str: Extracted text content prefixed with the file title
Raises:
ValueError: If file type is unsupported or file is corrupted
ImportError: If OCR packages are needed but not installed
"""
file_path = Path(file_path)
text = 'title: ' + file_path.stem + "\n"
try:
if file_path.suffix.lower() == '.pdf':
extracted_text = self.extract_text_from_pdf(file_path)
elif file_path.suffix.lower() == '.docx':
try:
doc = Document(str(file_path))
extracted_text = "\n".join([para.text for para in doc.paragraphs])
except PackageNotFoundError:
raise ValueError(f"Unable to open {file_path.name}. The file might be corrupted.")
elif file_path.suffix.lower() == '.txt':
try:
with open(file_path, 'r', encoding='utf-8') as file:
extracted_text = file.read()
except UnicodeDecodeError:
with open(file_path, 'r', encoding='ISO-8859-1') as file:
extracted_text = file.read()
else:
raise ValueError(f"Unsupported file type: {file_path.suffix}")
if not extracted_text.strip():
self.logger.warning(f"No readable text found in {file_path.name}. File may be empty or unreadable.")
except Exception as e:
self.logger.error(f"An error occurred while reading {file_path.name}: {e}")
raise e
return text + extracted_text if extracted_text.strip() else "No readable text found."
[docs]
def load_directory(self, load_from_dir: Union[Path, str]) -> List[str]:
"""
Load all valid document files within a directory.
Args:
load_from_dir (Union[Path, str]): Directory path to load documents from
Returns:
List[str]: List of extracted text content from all valid documents
"""
load_from_dir = Path(load_from_dir)
all_documents = []
for file in load_from_dir.glob('*'):
if file.suffix.lower() in ['.pdf', '.txt', '.docx']:
all_documents.append(self.load_readings(file))
return all_documents
[docs]
def load_lessons(self, lesson_number_or_range: Union[int, range], logger=None) -> Dict[str, List[str]]:
"""
Load specific lessons by scanning directories based on lesson numbers.
Args:
lesson_number_or_range (Union[int, range]): A single lesson number or range of lesson numbers to load.
logger: Optional custom logger.
Returns:
Dict[str, List[str]]: A dictionary where each key is a lesson number and each value is a list of readings.
"""
logger = logger or logger_setup(logger_name="load_lessons_func_logger", log_level=logging.WARNING)
lesson_range = (range(lesson_number_or_range, lesson_number_or_range + 1)
if isinstance(lesson_number_or_range, int)
else lesson_number_or_range)
all_readings = {}
# same pattern as validate_directory_structure
expected_pattern = r'^(L|Lesson|Lecture|W|Week)[\s_-]*(\d+)$'
filenames = []
subdirs = []
for subdir in self.reading_dir.iterdir():
if subdir.is_dir():
# load actual files
match = re.match(expected_pattern, subdir.name, re.IGNORECASE)
if match:
lesson_no = int(match.group(2))
if lesson_no in lesson_range:
logger.info(f"Loading readings for lesson {lesson_no} from directory: {subdir.name}")
all_readings[str(lesson_no)] = self.load_directory(subdir)
# load filenames for user
for file in subdir.glob('*'):
if file.suffix.lower() in ['.pdf', '.txt', '.docx']:
filenames.append(file.name)
subdirs.append(subdir.name)
else:
logger.info(f"Skipping directory {subdir.name} as it is not in the specified lesson range.")
else:
logger.warning(f"Directory '{subdir.name}' does not match expected lesson naming pattern.")
pretty_subdirs = ", ".join(list(dict.fromkeys(subdirs))) if subdirs else "(none)"
if filenames:
pretty = "\n - " + "\n - ".join(filenames)
self.logger.info(
f"""Lesson {lesson_number_or_range}: using the following readings:\n{pretty}
\nIf this is incorrect, please add the correct readings to the respective lesson folder(s): {pretty_subdirs}."""
)
else:
self.logger.warning(
f"""Lesson {lesson_number_or_range}: no readings assigned. If this is unexpected, please add the correct readings to the respective lesson folder(s): {
pretty_subdirs}."""
)
return all_readings
[docs]
def load_beamer_presentation(self, tex_path: Path) -> str:
"""
Loas a Beamer presentation from a .tex file and returns it as a string.
Args:
tex_path (Path): The path to the .tex file containing the Beamer presentation.
Returns:
str: The content of the .tex file.
"""
if tex_path == "No prior presentation available.":
return tex_path
tex_path = Path(tex_path)
with open(tex_path, 'r', encoding='utf-8') as file:
beamer_text = file.read()
return beamer_text
[docs]
def find_prior_beamer_presentation(self, lesson_no: int, max_attempts: int = 3) -> Path | str:
"""
Dynamically finds the most recent prior lesson to use as a template for slide creation.
Args:
lesson_no (int): The current lesson number.
max_attempts (int): The maximum number of previous lessons to attempt loading (default 3).
Returns:
Path | str: The path to the found Beamer file from a prior lesson.
Raises:
FileNotFoundError: If no valid prior lesson file is found within the `max_attempts` range.
"""
for i in range(1, max_attempts + 1):
prior_lesson = lesson_no - i
beamer_file = self.slide_dir / f'L{prior_lesson}.tex' if self.slide_dir else Path(f'L{prior_lesson}.tex')
# Check if the Beamer file exists for this prior lesson
if beamer_file.is_file():
self.logger.info(f"Found prior lesson: Lesson {prior_lesson}")
return beamer_file
# Raise error if no valid prior Beamer file is found within the attempts
self.logger.error(f"No prior Beamer file found within the last {max_attempts} lessons.\nReturning empty.")
return "No prior presentation available."
[docs]
def ocr_pdf(self, pdf_path: Path, max_workers: int = 4) -> str:
"""
Perform OCR on a PDF file to extract text content.
Args:
pdf_path (Path): Path to the PDF file
max_workers (int, optional): Number of parallel workers for OCR. Defaults to 4.
Returns:
str: Extracted text content from OCR
"""
import pytesseract
from pdf2image import convert_from_path
pdf_path = Path(pdf_path)
images = convert_from_path(str(pdf_path), dpi=300)
ocr_text = []
for image in images:
text = pytesseract.image_to_string(image)
ocr_text.append(text)
return " ".join(ocr_text)
[docs]
def load_docx_syllabus(self, syllabus_path) -> List[str]:
max_retries = 3
retry_delay = 10 # seconds
syllabus_path = Path(syllabus_path)
md = MarkItDown()
for attempt in range(max_retries):
try:
raw_content = md.convert(str(syllabus_path))
lines = raw_content.text_content.split("\n\n")
for line in lines:
# Handle table-like rows in lesson objectives
if line.startswith("|"):
tablines = line.split("\n")
lines.remove(line)
lines.extend(tablines)
return lines
except (PackageNotFoundError, FileConversionException, UnsupportedFormatException, PermissionError) as e:
if attempt < max_retries - 1:
print(f"Document `{syllabus_path.name}` is currently open. Retrying in {retry_delay} seconds...")
time.sleep(retry_delay)
else:
raise PackageNotFoundError("Unable to open the document after multiple attempts. Please close the file and try again.")
[docs]
def find_docx_indices(self, syllabus: List[str], current_lesson: int, lesson_identifier: str = ""
) -> Tuple[int | None, int | None, int | None, int | None]:
"""
Finds the indices of the lessons in the syllabus content.
Args:
syllabus (List[str]): A list of strings where each string represents a line in the syllabus document.
current_lesson (int): The lesson number for which to find surrounding lessons.
lesson_identifier (str, Defaults to None): The special word indicating a new lesson on the syllabus (eg "Lesson" or "Week")
Returns:
Tuple[int, int, int, int]: The indices of the previous, current, next, and the end of the next lesson.
"""
prev_lesson, curr_lesson, next_lesson, end_lesson = None, None, None, None
if self.tabular_syllabus:
# Search tables if no matches found
if curr_lesson is None:
# Matches table rows starting with numbers or "|"
lesson_pattern = re.compile(r"^(\d+)\s*\||^\s*\|\s*(\d+)\s*\|")
for i, line in enumerate(syllabus):
match = lesson_pattern.match(line)
if match:
matches = match.groups()
# Extract numeric lesson from match groups
lesson_number = int(matches[0] or matches[1])
if lesson_number == current_lesson - 1:
prev_lesson = i
elif lesson_number == current_lesson:
curr_lesson = i
elif lesson_number == current_lesson + 1:
next_lesson = i
elif lesson_number == current_lesson + 2:
end_lesson = i
# break
return prev_lesson, curr_lesson, next_lesson, end_lesson
else:
if not lesson_identifier:
lesson_identifiers = ['Lesson', 'Week', "**Lesson", "**Week"]
for lesson_identifier in lesson_identifiers:
escaped_identifier = re.escape(lesson_identifier)
lesson_pattern = re.compile(
rf"{escaped_identifier}\s*{current_lesson}.*?:")
for i, line in enumerate(syllabus):
if re.search(rf"{escaped_identifier}\s*{current_lesson - 1}.*?:?", line):
prev_lesson = i
elif lesson_pattern.search(line):
curr_lesson = i
elif re.search(rf"{escaped_identifier}\s*{current_lesson + 1}.*?:?", line):
next_lesson = i
elif re.search(rf"{escaped_identifier}\s*{current_lesson + 2}.*?:?", line):
end_lesson = i
break
if curr_lesson is not None:
break
else:
escaped_identifier = re.escape(lesson_identifier)
lesson_pattern = re.compile(
rf"{escaped_identifier}\s*{current_lesson}.*?:")
for i, line in enumerate(syllabus):
if re.search(rf"{escaped_identifier}\s*{current_lesson - 1}.*?:?", line):
prev_lesson = i
elif lesson_pattern.search(line):
curr_lesson = i
elif re.search(rf"{escaped_identifier}\s*{current_lesson + 1}.*?:?", line):
next_lesson = i
elif re.search(rf"{escaped_identifier}\s*{current_lesson + 2}.*?:?", line):
end_lesson = i
break
if curr_lesson is None:
self.logger.warning(
f"Lesson {current_lesson} not found in syllabus. Is this a tabular syllabus? Consider setting `tabular_syllabus=True`.")
return prev_lesson, curr_lesson, next_lesson, end_lesson
# %%
if __name__ == "__main__":
import os
import yaml
from dotenv import load_dotenv
user_home = Path.home()
load_dotenv()
pdf_syllabus_path = user_home / os.getenv('pdf_syllabus_path', "")
with open("class_config.yaml", "r") as file:
config = yaml.safe_load(file)
class_config = config['PS460']
slide_dir = user_home / class_config['slideDir']
syllabus_path = user_home / class_config['syllabus_path']
readingsDir = user_home / class_config['reading_dir']
is_tabular_syllabus = class_config['is_tabular_syllabus']
nontab_syllabus = user_home / config['PS211']['syllabus_path']
lsn = 8
loader = LessonLoader(syllabus_path=syllabus_path,
reading_dir=readingsDir,
slide_dir=slide_dir,
tabular_syllabus=is_tabular_syllabus)
objs = loader.extract_lesson_objectives(
current_lesson=lsn)
docs = loader.load_lessons(lesson_number_or_range=range(12, 14))
ocrDir = Path("C:/Users/Sean/OneDrive - afacademy.af.edu/Documents/Classes/Fall 2024/PS211/02_Class Readings/L21/reference")
pdf_path = ocrDir / "21.3 Pew Research Center. Beyond Red vs Blue Overview.pdf"
ocr_result = loader.ocr_pdf(pdf_path)
# %%