import gradio as gr
import subprocess
import os
import shutil
import zipfile
import re
import tempfile
from pathlib import Path
from datetime import datetime
import threading


class HTTrackScraper:
    """HTTrack website scraper with progress tracking and logging."""
    
    def __init__(self):
        self.log_buffer = []
        self.current_progress = 0
        
    def parse_url(self, url):
        """Clean and validate URL."""
        url = url.strip()
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        return url
    
    def generate_project_name(self, url):
        """Generate a safe project name from URL."""
        # Extract domain name
        domain = re.sub(r'^https?://(www\.)?', '', url)
        domain = re.sub(r'[^\w\-_.]', '_', domain.split('/')[0])
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        return f"{domain}_{timestamp}"
    
    def add_log(self, message, level="INFO"):
        """Add a log message with timestamp."""
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        log_entry = f"[{timestamp}] [{level}] {message}"
        self.log_buffer.append(log_entry)
        return "\n".join(self.log_buffer)
    
    def parse_httrack_output(self, line):
        """Parse HTTrack output for progress information."""
        # Look for percentage or file counts
        if '%' in line:
            match = re.search(r'(\d+)%', line)
            if match:
                return int(match.group(1))
        return None
    
    def scrape_website(self, url, max_depth=3, max_rate=1000000, 
                      respect_robots=True, progress=gr.Progress()):
        """
        Scrape website using HTTrack.
        
        Args:
            url: Website URL to scrape
            max_depth: Maximum link depth to follow
            max_rate: Maximum download rate (bytes/sec, 0=unlimited)
            respect_robots: Whether to respect robots.txt
            progress: Gradio progress tracker
        """
        try:
            # Initialize
            self.log_buffer = []
            self.current_progress = 0
            
            # Step 1: Validate URL
            progress(0.05, desc="🔍 Validating URL...")
            yield self.add_log(f"Starting scrape of: {url}"), None
            
            url = self.parse_url(url)
            yield self.add_log(f"Cleaned URL: {url}"), None
            
            # Step 2: Create temporary directory
            progress(0.1, desc="📁 Creating workspace...")
            temp_dir = tempfile.mkdtemp()
            project_name = self.generate_project_name(url)
            output_dir = os.path.join(temp_dir, project_name)
            os.makedirs(output_dir, exist_ok=True)
            
            yield self.add_log(f"Created temporary directory: {output_dir}"), None
            
            # Step 3: Build HTTrack command
            progress(0.15, desc="⚙️ Configuring HTTrack...")
            
            httrack_cmd = [
                'httrack',
                url,
                '-O', output_dir,
                f'-r{max_depth}',  # Recursion depth
                '-v',              # Verbose mode
                '-%v',             # Extra verbose
                '-s0',             # No robots.txt restrictions (if disabled)
                f'-A{max_rate}',   # Max transfer rate
                '-c8',             # 8 simultaneous connections
                '-%P',             # Extended parsing
                '-N0',             # Save all files in a single directory structure
                '-%k',             # Keep original links when possible
                '-I0',             # No index creation
                '*.css', '*.js', '*.png', '*.gif', '*.jpg', '*.jpeg',  # Accept common file types
                '-F', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'  # User agent
            ]
            
            if not respect_robots:
                httrack_cmd.append('-s0')
            
            yield self.add_log(f"Command: {' '.join(httrack_cmd)}"), None
            
            # Step 4: Run HTTrack
            progress(0.2, desc="🌐 Downloading website...")
            yield self.add_log("Starting HTTrack download process...", "INFO"), None
            
            try:
                process = subprocess.Popen(
                    httrack_cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    universal_newlines=True,
                    bufsize=1
                )
                
                # Monitor output
                line_count = 0
                for line in process.stdout:
                    line = line.strip()
                    if line:
                        line_count += 1
                        
                        # Update progress based on HTTrack output
                        percent = self.parse_httrack_output(line)
                        if percent:
                            # Map HTTrack progress (0-100%) to our range (20-80%)
                            progress_value = 0.2 + (percent / 100.0 * 0.6)
                            progress(progress_value, desc=f"🌐 Downloading... {percent}%")
                            self.current_progress = percent
                        
                        # Log important messages
                        if any(keyword in line.lower() for keyword in 
                               ['error', 'warning', 'done', 'finished', 'saved', 'scanned']):
                            yield self.add_log(line[:200]), None  # Limit line length
                        
                        # Update every 20 lines
                        if line_count % 20 == 0:
                            yield self.add_log(f"Processing... ({line_count} log lines)", "DEBUG"), None
                
                process.wait()
                
                if process.returncode != 0:
                    raise Exception(f"HTTrack exited with code {process.returncode}")
                
            except Exception as e:
                yield self.add_log(f"HTTrack error: {str(e)}", "ERROR"), None
                raise
            
            progress(0.85, desc="✅ Download complete!")
            yield self.add_log("Website download completed successfully!", "SUCCESS"), None
            
            # Step 5: Create ZIP archive
            progress(0.9, desc="📦 Creating ZIP archive...")
            yield self.add_log("Creating ZIP archive..."), None
            
            zip_path = os.path.join(temp_dir, f"{project_name}.zip")
            
            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
                file_count = 0
                for root, dirs, files in os.walk(output_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        arcname = os.path.relpath(file_path, output_dir)
                        zipf.write(file_path, arcname)
                        file_count += 1
                        
                        if file_count % 50 == 0:
                            progress(0.9 + (0.08 * min(file_count / 500, 1)), 
                                   desc=f"📦 Archiving... ({file_count} files)")
            
            yield self.add_log(f"Created ZIP archive with {file_count} files"), None
            
            # Step 6: Complete
            progress(1.0, desc="✨ Complete!")
            yield self.add_log(f"Scraping complete! Archive ready for download.", "SUCCESS"), zip_path
            
        except Exception as e:
            error_msg = f"Error during scraping: {str(e)}"
            yield self.add_log(error_msg, "ERROR"), None
            raise gr.Error(error_msg)


def create_app():
    """Create and configure the Gradio interface."""
    
    scraper = HTTrackScraper()
    
    # Custom CSS for better styling
    custom_css = """
    .gradio-container {
        font-family: 'Inter', sans-serif;
    }
    .success-box {
        border-left: 4px solid #10b981;
        background-color: #f0fdf4;
        padding: 1rem;
        border-radius: 0.5rem;
    }
    .log-box {
        font-family: 'Monaco', 'Courier New', monospace;
        font-size: 0.875rem;
        line-height: 1.5;
    }
    """
    
    with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as app:
        gr.Markdown(
            """
            # 🌐 HTTrack Website Scraper
            
            Download and archive websites for offline browsing, analysis, or backup.
            
            ⚠️ **Important**: Only scrape websites you have permission to download. 
            Respect robots.txt and copyright laws.
            """
        )
        
        with gr.Row():
            with gr.Column(scale=2):
                url_input = gr.Textbox(
                    label="Website URL",
                    placeholder="https://example.com",
                    info="Enter the full URL of the website to scrape"
                )
                
                with gr.Row():
                    max_depth = gr.Slider(
                        minimum=1,
                        maximum=10,
                        value=3,
                        step=1,
                        label="Max Depth",
                        info="How many link levels to follow (higher = more files)"
                    )
                    
                    max_rate = gr.Number(
                        value=1000000,
                        label="Max Rate (bytes/sec)",
                        info="0 = unlimited. Limit bandwidth usage to be respectful."
                    )
                
                respect_robots = gr.Checkbox(
                    value=True,
                    label="Respect robots.txt",
                    info="Honor website's robots.txt preferences"
                )
                
                scrape_btn = gr.Button("🚀 Start Scraping", variant="primary", size="lg")
            
            with gr.Column(scale=1):
                gr.Markdown(
                    """
                    ### 📋 Tips
                    
                    - **Depth 1-2**: Quick scrape, main pages only
                    - **Depth 3-5**: Moderate scrape, most content
                    - **Depth 6+**: Deep scrape, may take hours
                    
                    - Set max rate to avoid overloading servers
                    - Always respect robots.txt unless you own the site
                    - Large sites will take time - be patient!
                    """
                )
        
        with gr.Row():
            log_output = gr.Textbox(
                label="📄 Scraping Log",
                lines=15,
                max_lines=20,
                elem_classes=["log-box"],
                interactive=False,
                show_copy_button=True
            )
        
        with gr.Row():
            download_output = gr.File(
                label="📥 Download Archive",
                interactive=False
            )
        
        gr.Markdown(
            """
            ---
            
            ### 🔧 How it works
            
            1. **Validation**: The URL is validated and cleaned
            2. **Configuration**: HTTrack is configured with your settings
            3. **Download**: The website is recursively downloaded
            4. **Archiving**: All files are packaged into a ZIP archive
            5. **Delivery**: The archive is ready for download
            
            ### ⚡ Technical Details
            
            - Uses HTTrack command-line tool for robust website mirroring
            - Preserves original link structure for offline browsing
            - Downloads HTML, CSS, JavaScript, images, and other assets
            - Creates compressed ZIP archive for easy distribution
            """
        )
        
        # Wire up the scraping function
        scrape_btn.click(
            fn=scraper.scrape_website,
            inputs=[url_input, max_depth, max_rate, respect_robots],
            outputs=[log_output, download_output],
            show_progress=True
        )
    
    return app


if __name__ == "__main__":
    app = create_app()
    app.queue()  # Enable queuing for progress updates
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )