import gradio as gr
import subprocess
import os
import zipfile
import re
import tempfile
from datetime import datetime


class HTTrackScraper:
    """HTTrack website scraper with progress tracking and logging."""

    def __init__(self):
        self.log_buffer = []
        self.current_progress = 0

    def parse_url(self, url):
        """Clean and validate URL."""
        url = url.strip()
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        return url

    def generate_project_name(self, url):
        """Generate a safe project name from URL."""
        # Extract the domain name and sanitize it for use as a directory name
        domain = re.sub(r'^https?://(www\.)?', '', url)
        domain = re.sub(r'[^\w\-_.]', '_', domain.split('/')[0])
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        return f"{domain}_{timestamp}"

    def add_log(self, message, level="INFO"):
        """Add a log message with timestamp and return the full log text."""
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        log_entry = f"[{timestamp}] [{level}] {message}"
        self.log_buffer.append(log_entry)
        return "\n".join(self.log_buffer)

    def parse_httrack_output(self, line):
        """Parse a line of HTTrack output for progress information."""
        # Look for a percentage figure in the output line
        if '%' in line:
            match = re.search(r'(\d+)%', line)
            if match:
                return int(match.group(1))
        return None

    def scrape_website(self, url, max_depth=3, max_rate=1000000,
                       respect_robots=True, progress=gr.Progress()):
        """
        Scrape a website using HTTrack.

        Args:
            url: Website URL to scrape
            max_depth: Maximum link depth to follow
            max_rate: Maximum download rate (bytes/sec, 0 = unlimited)
            respect_robots: Whether to respect robots.txt
            progress: Gradio progress tracker
        """
        try:
            # Initialize state for this run
            self.log_buffer = []
            self.current_progress = 0

            # Step 1: Validate URL
            progress(0.05, desc="🔍 Validating URL...")
            yield self.add_log(f"Starting scrape of: {url}"), None

            url = self.parse_url(url)
            yield self.add_log(f"Cleaned URL: {url}"), None

            # Step 2: Create temporary directory
            progress(0.1, desc="📁 Creating workspace...")
            temp_dir = tempfile.mkdtemp()
            project_name = self.generate_project_name(url)
            output_dir = os.path.join(temp_dir, project_name)
            os.makedirs(output_dir, exist_ok=True)
            yield self.add_log(f"Created temporary directory: {output_dir}"), None

            # Step 3: Build HTTrack command
            progress(0.15, desc="⚙️ Configuring HTTrack...")
            httrack_cmd = [
                'httrack', url,
                '-O', output_dir,
                f'-r{int(max_depth)}',  # Recursion (mirror) depth
                '-v',                   # Verbose mode
                '-%v',                  # Display downloaded filenames in real time
                f'-A{int(max_rate)}',   # Max transfer rate (bytes/sec)
                '-c8',                  # 8 simultaneous connections
                '-%P',                  # Extended parsing
                '-N0',                  # Keep the original site structure (default)
                '-%k',                  # Use keep-alive connections when possible
                '-I0',                  # No index creation
                # Explicitly accept common asset types (HTTrack filters need a '+' prefix)
                '+*.css', '+*.js', '+*.png', '+*.gif', '+*.jpg', '+*.jpeg',
                '-F', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'  # User agent
            ]

            if not respect_robots:
                httrack_cmd.append('-s0')  # Ignore robots.txt restrictions

            yield self.add_log(f"Command: {' '.join(httrack_cmd)}"), None

            # Step 4: Run HTTrack
            progress(0.2, desc="🌐 Downloading website...")
            yield self.add_log("Starting HTTrack download process...", "INFO"), None

            try:
                process = subprocess.Popen(
                    httrack_cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    universal_newlines=True,
                    bufsize=1
                )

                # Monitor HTTrack output line by line
                line_count = 0
                for line in process.stdout:
                    line = line.strip()
                    if line:
                        line_count += 1

                        # Update progress based on HTTrack output
                        percent = self.parse_httrack_output(line)
                        if percent is not None:
                            # Map HTTrack progress (0-100%) to our range (20-80%)
                            progress_value = 0.2 + (percent / 100.0 * 0.6)
                            progress(progress_value, desc=f"🌐 Downloading... {percent}%")
                            self.current_progress = percent

                        # Log important messages
                        if any(keyword in line.lower() for keyword in
                               ['error', 'warning', 'done', 'finished', 'saved', 'scanned']):
                            yield self.add_log(line[:200]), None  # Limit line length

                        # Emit a heartbeat entry every 20 lines
                        if line_count % 20 == 0:
                            yield self.add_log(f"Processing... ({line_count} log lines)", "DEBUG"), None

                process.wait()

                if process.returncode != 0:
                    raise Exception(f"HTTrack exited with code {process.returncode}")

            except Exception as e:
                yield self.add_log(f"HTTrack error: {str(e)}", "ERROR"), None
                raise

            progress(0.85, desc="✅ Download complete!")
            yield self.add_log("Website download completed successfully!", "SUCCESS"), None

            # Step 5: Create ZIP archive
            progress(0.9, desc="📦 Creating ZIP archive...")
            yield self.add_log("Creating ZIP archive..."), None

            zip_path = os.path.join(temp_dir, f"{project_name}.zip")
            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
                file_count = 0
                for root, dirs, files in os.walk(output_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        arcname = os.path.relpath(file_path, output_dir)
                        zipf.write(file_path, arcname)
                        file_count += 1
                        if file_count % 50 == 0:
                            progress(0.9 + (0.08 * min(file_count / 500, 1)),
                                     desc=f"📦 Archiving... ({file_count} files)")

            yield self.add_log(f"Created ZIP archive with {file_count} files"), None

            # Step 6: Complete
            progress(1.0, desc="✨ Complete!")
            yield self.add_log("Scraping complete! Archive ready for download.", "SUCCESS"), zip_path

        except Exception as e:
            error_msg = f"Error during scraping: {str(e)}"
            yield self.add_log(error_msg, "ERROR"), None
            raise gr.Error(error_msg)
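
# For reference, the command assembled in scrape_website is roughly equivalent to
# invoking HTTrack from a shell as below (illustrative values only; the URL and
# output directory are placeholders, and -s0 is appended only when robots.txt
# should be ignored):
#
#   httrack https://example.com -O ./example_mirror \
#       -r3 -v -%v -A1000000 -c8 -%P -N0 -%k -I0 \
#       '+*.css' '+*.js' '+*.png' '+*.gif' '+*.jpg' '+*.jpeg' \
#       -F 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
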
def create_app():
    """Create and configure the Gradio interface."""
    scraper = HTTrackScraper()

    # Custom CSS for better styling
    custom_css = """
    .gradio-container {
        font-family: 'Inter', sans-serif;
    }
    .success-box {
        border-left: 4px solid #10b981;
        background-color: #f0fdf4;
        padding: 1rem;
        border-radius: 0.5rem;
    }
    .log-box {
        font-family: 'Monaco', 'Courier New', monospace;
        font-size: 0.875rem;
        line-height: 1.5;
    }
    """

    with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as app:
        gr.Markdown(
            """
            # 🌐 HTTrack Website Scraper

            Download and archive websites for offline browsing, analysis, or backup.

            ⚠️ **Important**: Only scrape websites you have permission to download.
            Respect robots.txt and copyright laws.
            """
        )

        with gr.Row():
            with gr.Column(scale=2):
                url_input = gr.Textbox(
                    label="Website URL",
                    placeholder="https://example.com",
                    info="Enter the full URL of the website to scrape"
                )

                with gr.Row():
                    max_depth = gr.Slider(
                        minimum=1,
                        maximum=10,
                        value=3,
                        step=1,
                        label="Max Depth",
                        info="How many link levels to follow (higher = more files)"
                    )
                    max_rate = gr.Number(
                        value=1000000,
                        label="Max Rate (bytes/sec)",
                        info="0 = unlimited. Limit bandwidth usage to be respectful."
                    )

                respect_robots = gr.Checkbox(
                    value=True,
                    label="Respect robots.txt",
                    info="Honor website's robots.txt preferences"
                )

                scrape_btn = gr.Button("🚀 Start Scraping", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown(
                    """
                    ### 📋 Tips
                    - **Depth 1-2**: Quick scrape, main pages only
                    - **Depth 3-5**: Moderate scrape, most content
                    - **Depth 6+**: Deep scrape, may take hours
                    - Set max rate to avoid overloading servers
                    - Always respect robots.txt unless you own the site
                    - Large sites will take time - be patient!
                    """
                )

        with gr.Row():
            log_output = gr.Textbox(
                label="📄 Scraping Log",
                lines=15,
                max_lines=20,
                elem_classes=["log-box"],
                interactive=False,
                show_copy_button=True
            )

        with gr.Row():
            download_output = gr.File(
                label="📥 Download Archive",
                interactive=False
            )

        gr.Markdown(
            """
            ---
            ### 🔧 How it works
            1. **Validation**: The URL is validated and cleaned
            2. **Configuration**: HTTrack is configured with your settings
            3. **Download**: The website is recursively downloaded
            4. **Archiving**: All files are packaged into a ZIP archive
            5. **Delivery**: The archive is ready for download

            ### ⚡ Technical Details
            - Uses the HTTrack command-line tool for robust website mirroring
            - Preserves original link structure for offline browsing
            - Downloads HTML, CSS, JavaScript, images, and other assets
            - Creates a compressed ZIP archive for easy distribution
            """
        )

        # Wire up the scraping function
        scrape_btn.click(
            fn=scraper.scrape_website,
            inputs=[url_input, max_depth, max_rate, respect_robots],
            outputs=[log_output, download_output],
            show_progress=True
        )

    return app
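
# The scraper can also be driven without the UI. scrape_website is a generator
# yielding (log_text, zip_path) tuples, where zip_path is only set on the final
# yield. A minimal sketch, assuming HTTrack is installed and on the PATH and
# using a placeholder URL:
#
#   scraper = HTTrackScraper()
#   updates = scraper.scrape_website(
#       "https://example.com",
#       max_depth=2,
#       progress=lambda *args, **kwargs: None,  # no-op stand-in for gr.Progress
#   )
#   zip_path = None
#   for log_text, zip_path in updates:
#       print(log_text.splitlines()[-1])  # newest log entry
#   print("Archive:", zip_path)
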
""" ) with gr.Row(): log_output = gr.Textbox( label="📄 Scraping Log", lines=15, max_lines=20, elem_classes=["log-box"], interactive=False, show_copy_button=True ) with gr.Row(): download_output = gr.File( label="📥 Download Archive", interactive=False ) gr.Markdown( """ --- ### 🔧 How it works 1. **Validation**: The URL is validated and cleaned 2. **Configuration**: HTTrack is configured with your settings 3. **Download**: The website is recursively downloaded 4. **Archiving**: All files are packaged into a ZIP archive 5. **Delivery**: The archive is ready for download ### ⚡ Technical Details - Uses HTTrack command-line tool for robust website mirroring - Preserves original link structure for offline browsing - Downloads HTML, CSS, JavaScript, images, and other assets - Creates compressed ZIP archive for easy distribution """ ) # Wire up the scraping function scrape_btn.click( fn=scraper.scrape_website, inputs=[url_input, max_depth, max_rate, respect_robots], outputs=[log_output, download_output], show_progress=True ) return app if __name__ == "__main__": app = create_app() app.queue() # Enable queuing for progress updates app.launch( server_name="0.0.0.0", server_port=7860, show_error=True )