#!/usr/bin/env python3
# ***************************************************************************************************************************
# * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file *
# * distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file        *
# * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance            *
# * with the License.  You may obtain a copy of the License at                                                              *
# *                                                                                                                         *
# *  http://www.apache.org/licenses/LICENSE-2.0                                                                             *
# *                                                                                                                         *
# * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an  *
# * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the        *
# * specific language governing permissions and limitations under the License.                                              *
# ***************************************************************************************************************************
"""
Script to check for correct topic links in the Juneau source tree.

This script:
1. Scans /docs/pages/topics for all markdown files
2. Extracts slug names and titles from the frontmatter
3. Scans the entire source tree for links to https://juneau.apache.org/docs/topics/SLUG">TITLE</a>
4. Reports any mismatches between expected and actual slug/title combinations
"""

import os
import re
import sys
from pathlib import Path

def extract_topic_info(docs_dir):
    """Extract slug and title information from all topic markdown files.

    Reads every ``*.md`` file (except ``README.md``) directly under
    ``<docs_dir>/pages/topics`` and parses the ``title`` and ``slug`` keys
    from the ``---`` delimited frontmatter at the top of the file.

    Progress, warnings and errors are printed to stdout; files that cannot be
    read or parsed are skipped rather than aborting the scan.

    Args:
        docs_dir: Path (str or Path) of the docs directory that contains
            ``pages/topics``.

    Returns:
        A dict mapping each slug to its title. Empty if the topics directory
        does not exist or no file yields both a title and a slug.
    """
    topics = {}
    topics_dir = Path(docs_dir) / "pages" / "topics"

    if not topics_dir.exists():
        print(f"ERROR: Topics directory not found: {topics_dir}")
        return topics

    frontmatter_pattern = re.compile(r'^---\n(.*?)\n---', re.DOTALL)
    # Values are confined to their own line ([ \t] and '.' never cross a
    # newline), so an unquoted or empty value cannot swallow the next key.
    title_pattern = re.compile(r'^title:[ \t]*(.*?)[ \t]*$', re.MULTILINE)
    slug_pattern = re.compile(r'^slug:[ \t]*(\S+)', re.MULTILINE)

    def unquote(value):
        # Strip one pair of matching surrounding quotes; quotes or apostrophes
        # inside the value (e.g. "What's New") are preserved.
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in '"\'':
            value = value[1:-1].strip()
        return value

    # Sorted so the printed output (and which file wins on a duplicate slug)
    # does not depend on directory listing order.
    for md_file in sorted(topics_dir.glob("*.md")):
        if md_file.name == "README.md":
            continue

        try:
            # utf-8-sig decodes plain UTF-8 identically but also drops a
            # leading BOM, which would otherwise hide the opening '---'.
            with open(md_file, 'r', encoding='utf-8-sig') as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"ERROR: Failed to process {md_file.name}: {e}")
            continue

        frontmatter_match = frontmatter_pattern.search(content)
        if not frontmatter_match:
            print(f"WARNING: No frontmatter found in {md_file.name}")
            continue

        frontmatter = frontmatter_match.group(1)
        title_match = title_pattern.search(frontmatter)
        slug_match = slug_pattern.search(frontmatter)

        title = unquote(title_match.group(1)) if title_match else ""
        slug = unquote(slug_match.group(1)) if slug_match else ""

        if title and slug:
            topics[slug] = title
            print(f"Found topic: {slug} -> {title}")
        else:
            print(f"WARNING: Could not extract title/slug from {md_file.name}")

    return topics

def find_topic_links(source_dir):
    """Find all topic links in the source tree.

    Walks ``source_dir`` recursively, skipping hidden directories and the
    ``target``, ``node_modules``, ``build``, ``dist`` and ``scripts``
    directories, and searches files with a recognised text extension for
    links of the form
    ``https://juneau.apache.org/docs/topics/SLUG">TITLE</a>``.

    For markdown files, fenced code blocks and inline code spans are blanked
    out before searching so that example links are not reported.

    Args:
        source_dir: Root directory (str or Path) to scan.

    Returns:
        A list of dicts, one per link found, with the keys ``file`` (path
        relative to ``source_dir``), ``line`` (1-based line of the link in
        the file), ``slug``, ``title`` and ``full_match``.
    """
    links = []

    # Pattern to match topic links
    link_pattern = re.compile(r'https://juneau\.apache\.org/docs/topics/([^"]+)">([^<]+)</a>')
    # Markdown code that must not be searched: fenced blocks and inline spans.
    code_block_pattern = re.compile(r'```.*?```', re.DOTALL)
    inline_code_pattern = re.compile(r'`[^`]+`')

    # File extensions to search (tuple so it can be passed to str.endswith)
    extensions = ('.java', '.md', '.xml', '.properties', '.txt', '.adoc', '.rst')
    skipped_dirs = {'target', 'node_modules', 'build', 'dist', 'scripts'}

    def blank_out(match):
        # Drop the code but keep its newlines so that line numbers computed
        # on the filtered text still refer to lines of the original file.
        return '\n' * match.group(0).count('\n')

    for root, dirs, files in os.walk(source_dir):
        # Prune in place so os.walk does not descend into skipped directories;
        # sorted for a deterministic report order.
        dirs[:] = sorted(d for d in dirs if not d.startswith('.') and d not in skipped_dirs)

        for file in sorted(files):
            # Skip report files generated by this script
            if file.startswith('topic-link-check-') and file.endswith('.txt'):
                continue
            if not file.endswith(extensions):
                continue

            file_path = Path(root) / file
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
            except OSError as e:
                print(f"ERROR: Failed to process {file_path}: {e}")
                continue

            if file.endswith('.md'):
                content = code_block_pattern.sub(blank_out, content)
                content = inline_code_pattern.sub(blank_out, content)

            relative_path = str(file_path.relative_to(source_dir))
            for match in link_pattern.finditer(content):
                links.append({
                    'file': relative_path,
                    'line': content[:match.start()].count('\n') + 1,
                    'slug': match.group(1),
                    'title': match.group(2),
                    'full_match': match.group(0)
                })

    return links

def check_links(links, topics):
    """Compare each found link against the known slug -> title mapping.

    Args:
        links: Iterable of link dicts with the keys 'file', 'line', 'slug',
            'title' and 'full_match'.
        topics: Dict mapping each known slug to its expected title.

    Returns:
        A list of warning dicts in the order the links were given. A link
        whose slug is not in ``topics`` yields an 'unknown_slug' warning; a
        link whose title differs from the expected one yields a
        'title_mismatch' warning. Links that match produce nothing.
    """
    problems = []

    for entry in links:
        found_slug, found_title = entry['slug'], entry['title']

        # The slug does not correspond to any known topic.
        if found_slug not in topics:
            problems.append({
                'type': 'unknown_slug',
                'file': entry['file'],
                'line': entry['line'],
                'slug': found_slug,
                'title': found_title,
                'full_match': entry['full_match']
            })
            continue

        wanted_title = topics[found_slug]
        if found_title == wanted_title:
            continue

        # Known slug, but the link text is not the topic's title.
        problems.append({
            'type': 'title_mismatch',
            'file': entry['file'],
            'line': entry['line'],
            'slug': found_slug,
            'expected_title': wanted_title,
            'actual_title': found_title,
            'full_match': entry['full_match']
        })

    return problems

def main():
    """Run the topic link check and print a report to stdout.

    Locates the repository root as the parent of the directory containing
    this script, collects topics from ``<root>/docs``, scans the root for
    topic links and prints any title mismatches or unknown slugs.

    Exits with status 1 when no topics could be found. Otherwise exits with
    status 0, including when warnings are reported: warnings are
    informational only and do not fail the build.
    """
    # Get the script directory (should be /juneau/scripts). Resolve first:
    # if __file__ is a bare relative name, Path('x.py').parent.parent is '.'
    # and the script directory itself would be taken for the root.
    script_dir = Path(__file__).resolve().parent
    juneau_root = script_dir.parent
    docs_dir = juneau_root / "docs"

    print("Juneau Topic Link Checker")
    print("=" * 50)

    # Extract topic information
    print("\nExtracting topic information from docs...")
    topics = extract_topic_info(docs_dir)

    if not topics:
        print("ERROR: No topics found. Check if docs directory exists and contains topic files.")
        sys.exit(1)

    print(f"\nFound {len(topics)} topics")

    # Find all topic links
    print("\nScanning source tree for topic links...")
    links = find_topic_links(juneau_root)

    print(f"Found {len(links)} topic links")

    # Check for issues
    print("\nChecking links for issues...")
    warnings = check_links(links, topics)

    # Report results
    if not warnings:
        print("\n✓ All topic links are correct!")
        sys.exit(0)

    print(f"\nWARNINGS ({len(warnings)} found):")
    print("=" * 50)

    for warning in warnings:
        if warning['type'] == 'title_mismatch':
            print("TITLE MISMATCH:")
            print(f"  File: {warning['file']}:{warning['line']}")
            print(f"  Slug: {warning['slug']}")
            print(f"  Expected title: '{warning['expected_title']}'")
            print(f"  Actual title:   '{warning['actual_title']}'")
            print(f"  Link: {warning['full_match']}")
            print()
        elif warning['type'] == 'unknown_slug':
            print("UNKNOWN SLUG:")
            print(f"  File: {warning['file']}:{warning['line']}")
            print(f"  Slug: {warning['slug']}")
            print(f"  Title: '{warning['title']}'")
            print(f"  Link: {warning['full_match']}")
            print()

    print(f"Total warnings: {len(warnings)}")
    print("\nNote: Warnings are informational only and do not fail the build.")
    sys.exit(0)  # Exit successfully with warnings

# Run the checker only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
