# Source code for class_factory.concept_web.build_concept_map

"""Build and analyze concept maps from relationship data.

This module provides functionality to create, analyze and visualize concept maps
based on relationships between concepts extracted from educational content.

Functions:
    build_graph: Create a weighted graph from concept relationships.
    detect_communities: Identify concept clusters using various community detection algorithms.

The module supports both directed and undirected graphs, with features including:
- Edge weight normalization
- Node centrality calculation
- Community detection using multiple algorithms (leiden, louvain, spectral)
- Visualization preparation with node sizes and community labels
"""
import logging
import os
from pathlib import Path
from typing import List, Tuple

import networkx as nx
import networkx.algorithms.community as nx_comm
from cdlib import algorithms
from dotenv import load_dotenv
from sklearn.cluster import SpectralClustering

# %%
from class_factory.concept_web.concept_extraction import process_relationships

# Presumably silences warnings emitted by cdlib.
# NOTE(review): this assignment runs after `cdlib` has already been imported
# above; confirm cdlib reads CDLIB_WARNINGS lazily, otherwise it has no effect.
os.environ["CDLIB_WARNINGS"] = "ignore"

# Load environment variables (python-dotenv) before `projectDir` is read below.
load_dotenv()

# Path definitions
# NOTE(review): if the `projectDir` environment variable is unset, os.getenv
# returns None and Path(None) raises TypeError when this module is imported.
projectDir = Path(os.getenv('projectDir'))
# Directory holding the JSON fixtures read by the __main__ block of this module.
dataDir = projectDir / "tests/data/"

# %%



# [docs]
def build_graph(
    processed_relationships: List[Tuple[str, str, str]],
    directed: bool = False
) -> nx.Graph | nx.DiGraph:
    """
    Build a weighted (directed or undirected) graph from processed concept relationships.

    Each ``(concept1, relationship, concept2)`` tuple adds an edge between the two
    concepts. Tuples whose relationship is the string ``"None"`` or ``"none"`` are
    skipped. A repeated edge increments the edge's ``weight`` and adds the label
    to the edge's ``relation`` set.

    After construction two independent rescalings are applied:

    * Edge ``weight`` is mapped linearly onto ``normalized_weight`` in [0.5, 4].
      If every edge has the same weight there is nothing to scale; a warning is
      logged and ``normalized_weight`` is left unset.
    * Degree centrality (in-degree centrality when ``directed`` is True) is
      mapped linearly onto ``text_size`` in [6, 24] and stored as ``centrality``.
      If every node has the same centrality a warning is logged and the
      defaults ``text_size=12`` and ``centrality=0.5`` are assigned instead.

    Args:
        processed_relationships (List[Tuple[str, str, str]]): List of (concept1, relationship, concept2) tuples.
        directed (bool, optional): If True, creates a directed graph. Defaults to False.

    Returns:
        nx.Graph | nx.DiGraph: Graph with node and edge attributes for visualization and analysis.
        An empty graph is returned (with a warning) when no usable relationship is supplied.

    Raises:
        ValueError: If an entry does not unpack into exactly three items.
    """
    # Graph flavour depends on the `directed` flag
    G = nx.DiGraph() if directed else nx.Graph()

    # Add nodes and edges from relationships
    for concept1, relationship, concept2 in processed_relationships:
        if relationship in ("None", "none"):
            # Placeholder label meaning "no relationship": contributes nothing
            continue
        if G.has_edge(concept1, concept2):
            edge_data = G[concept1][concept2]
            edge_data['relation'].add(relationship)
            edge_data['weight'] += 1
        else:
            G.add_edge(concept1, concept2, weight=1,
                       relation={relationship})

    # Nothing to normalize; min()/max() over empty centrality values would raise.
    if G.number_of_nodes() == 0:
        logging.warning(
            "No usable relationships were provided; returning an empty graph.")
        return G

    # --- Edge weights -> 'normalized_weight' ---
    min_normalized_weight = 0.5
    max_normalized_weight = 4

    edge_weights = nx.get_edge_attributes(G, 'weight').values()
    max_weight = max(edge_weights, default=1)
    min_weight = min(edge_weights, default=1)
    weight_span = max_weight - min_weight

    if weight_span:
        for u, v, d in G.edges(data=True):
            normalized_weight = min_normalized_weight + (max_normalized_weight - min_normalized_weight) * \
                (d['weight'] - min_weight) / weight_span
            G[u][v]['normalized_weight'] = normalized_weight
    else:
        # Uniform weights: scaling would divide by zero. This must not prevent
        # the centrality-based sizing below from running.
        logging.warning(
            "Normalization of edge weights skipped due to lack of variation in the graph.\n"
            "Returning unnormalized edge weights")

    # --- Degree centrality -> 'text_size' ---
    if directed:
        centrality = nx.in_degree_centrality(G)
    else:
        centrality = nx.degree_centrality(G)

    # Range of text sizes used for node labels (6 to 24)
    min_size = 6
    max_size = 24
    max_centrality = max(centrality.values())
    min_centrality = min(centrality.values())
    centrality_span = max_centrality - min_centrality

    if centrality_span:
        for node, centrality_value in centrality.items():
            normalized_size = min_size + (max_size - min_size) * (
                centrality_value - min_centrality) / centrality_span
            G.nodes[node]['text_size'] = normalized_size
            G.nodes[node]['centrality'] = centrality_value
    else:
        logging.warning(
            "Normalization of centrality skipped due to lack of variation in the graph.\n"
            "Returning default text size and centrality")
        # Fall back to default sizes when every node is equally central
        for node in G.nodes():
            G.nodes[node]['text_size'] = 12  # Default text size
            G.nodes[node]['centrality'] = 0.5  # Default centrality

    return G




# [docs]
def detect_communities(
    G: nx.Graph | nx.DiGraph,
    method: str = "leiden",
    num_clusters: int | None = None
) -> nx.Graph | nx.DiGraph:
    """
    Detect communities in a concept graph using the specified algorithm.

    The input graph is not modified; a copy is returned in which every node that
    belongs to a detected community carries an integer ``community`` attribute
    (the index of its community in the algorithm's output).

    Args:
        G (nx.Graph | nx.DiGraph): The input graph.
        method (str, optional): Community detection algorithm ('leiden', 'louvain', 'spectral'). Defaults to 'leiden'.
        num_clusters (int | None, optional): Number of clusters for spectral clustering.
            Required when ``method`` is 'spectral'; ignored otherwise. Defaults to None.

    Returns:
        nx.Graph | nx.DiGraph: Graph with 'community' node attributes.

    Raises:
        ValueError: If the specified method is not recognized, or if ``method`` is
            'spectral' and ``num_clusters`` is not provided.
    """
    G_copy = G.copy()

    if method == "leiden":
        # Leiden via cdlib; the result is a NodeClustering object whose
        # `.communities` attribute holds the node groups.
        communities_obj = algorithms.leiden(G)
        communities = communities_obj.communities
    elif method == "louvain":
        # Louvain via networkx
        communities = nx_comm.louvain_communities(G)
    elif method == "spectral":
        if num_clusters is None:
            # Fail early with a clear message instead of an opaque error from
            # scikit-learn or from range(None) further down.
            raise ValueError(
                "num_clusters must be provided when method is 'spectral'.")

        # Fix the node order so matrix rows and predicted labels line up
        nodes = list(G.nodes())

        # Adjacency matrix used as the precomputed affinity
        adj_matrix = nx.to_numpy_array(G, nodelist=nodes)

        # Apply spectral clustering
        sc = SpectralClustering(n_clusters=num_clusters,
                                affinity='precomputed', assign_labels='kmeans')
        labels = sc.fit_predict(adj_matrix)

        # Group nodes by their cluster labels using node names instead of indices
        communities = [set() for _ in range(num_clusters)]
        for node, label in zip(nodes, labels):
            communities[label].add(node)
    else:
        raise ValueError(
            f"Unknown method: {method}. Choose 'leiden', 'louvain' or 'spectral'.")

    # Assign each node to its community for visualization
    for i, community in enumerate(communities):
        for node in community:
            # Community index doubles as a group ID for visualization
            G_copy.nodes[node]['community'] = i

    return G_copy



if __name__ == "__main__":
    import json

    # Manual smoke run against the JSON fixtures under `dataDir`.
    # JSON is UTF-8 by specification, so the encoding is given explicitly rather
    # than relying on the platform default.
    # NOTE(review): if `dataDir` is absolute, pathlib discards the Path.home()
    # prefix in the joins below; confirm which location is intended.
    with open(Path.home() / dataDir / 'conceptlist_test.json', 'r', encoding='utf-8') as f:
        conceptlist = json.load(f)

    with open(Path.home() / dataDir / 'relationshiplist_test.json', 'r', encoding='utf-8') as f:
        relationship_list = json.load(f)
    # Build the graph
    G_base = build_graph(relationship_list, directed=True)
    # Detect communities using the Leiden method
    G = detect_communities(G_base, method="leiden")

# %%