import gradio as gr
import subprocess
import os
import shutil
import zipfile
import re
import tempfile
from pathlib import Path
from datetime import datetime
import threading


class HTTrackScraper:
    """HTTrack website scraper with progress tracking and logging."""

    def __init__(self):
        self.log_buffer = []
        self.current_progress = 0

    def parse_url(self, url):
        """Clean and validate URL."""
        url = url.strip()
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        return url

    def generate_project_name(self, url):
        """Generate a safe project name from URL."""
        # Extract domain name
        domain = re.sub(r'^https?://(www\.)?', '', url)
        domain = re.sub(r'[^\w\-_.]', '_', domain.split('/')[0])
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        return f"{domain}_{timestamp}"

    def add_log(self, message, level="INFO"):
        """Add a log message with timestamp."""
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        log_entry = f"[{timestamp}] [{level}] {message}"
        self.log_buffer.append(log_entry)
        return "\n".join(self.log_buffer)

    def parse_httrack_output(self, line):
        """Parse HTTrack output for progress information."""
        # Look for percentage or file counts
        if '%' in line:
            match = re.search(r'(\d+)%', line)
            if match:
                return int(match.group(1))
        return None

    def scrape_website(self, url, max_depth=3, max_rate=1000000,
                       respect_robots=True, progress=gr.Progress()):
        """
        Scrape website using HTTrack.

        Args:
            url: Website URL to scrape
            max_depth: Maximum link depth to follow
            max_rate: Maximum download rate (bytes/sec, 0 = unlimited)
            respect_robots: Whether to respect robots.txt
            progress: Gradio progress tracker
        """
        try:
            # Initialize
            self.log_buffer = []
            self.current_progress = 0

            # Step 1: Validate URL
            progress(0.05, desc="Validating URL...")
            yield self.add_log(f"Starting scrape of: {url}"), None
            url = self.parse_url(url)
            yield self.add_log(f"Cleaned URL: {url}"), None
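
            # Optional safeguard (sketch, not required by the rest of the code):
            # fail fast with a clear message if the `httrack` binary is not on
            # PATH, since every later step shells out to it. `shutil.which` is
            # standard library; the RuntimeError is surfaced by the outer handler.
            if shutil.which('httrack') is None:
                raise RuntimeError(
                    "httrack binary not found on PATH - install it first "
                    "(e.g. apt-get install httrack)."
                )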

            # Step 2: Create temporary directory
            progress(0.1, desc="Creating workspace...")
            temp_dir = tempfile.mkdtemp()
            project_name = self.generate_project_name(url)
            output_dir = os.path.join(temp_dir, project_name)
            os.makedirs(output_dir, exist_ok=True)
            yield self.add_log(f"Created temporary directory: {output_dir}"), None

            # Step 3: Build HTTrack command
            progress(0.15, desc="Configuring HTTrack...")
            httrack_cmd = [
                'httrack',
                url,
                '-O', output_dir,
                f'-r{int(max_depth)}',  # Recursion depth
                '-v',                   # Verbose mode
                '-%v',                  # Display downloaded file names in real time
                f'-A{int(max_rate)}',   # Max transfer rate (bytes/sec)
                '-c8',                  # 8 simultaneous connections
                '-%P',                  # Extended parsing
                '-N0',                  # Default site structure
                '-%k',                  # Use keep-alive connections when possible
                '-I0',                  # No index creation
                '+*.css', '+*.js', '+*.png', '+*.gif', '+*.jpg', '+*.jpeg',  # Accept common asset types
                '-F', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'  # User agent
            ]
            if not respect_robots:
                # Only disable robots.txt handling when the user opts out
                httrack_cmd.append('-s0')
            yield self.add_log(f"Command: {' '.join(httrack_cmd)}"), None

            # Step 4: Run HTTrack
            progress(0.2, desc="Downloading website...")
            yield self.add_log("Starting HTTrack download process...", "INFO"), None
            try:
                process = subprocess.Popen(
                    httrack_cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    universal_newlines=True,
                    bufsize=1
                )
                # Monitor output
                line_count = 0
                for line in process.stdout:
                    line = line.strip()
                    if line:
                        line_count += 1
                        # Update progress based on HTTrack output
                        percent = self.parse_httrack_output(line)
                        if percent is not None:
                            # Map HTTrack progress (0-100%) to our range (20-80%)
                            progress_value = 0.2 + (percent / 100.0 * 0.6)
                            progress(progress_value, desc=f"Downloading... {percent}%")
                            self.current_progress = percent
                        # Log important messages
                        if any(keyword in line.lower() for keyword in
                               ['error', 'warning', 'done', 'finished', 'saved', 'scanned']):
                            yield self.add_log(line[:200]), None  # Limit line length
                        # Update every 20 lines
                        if line_count % 20 == 0:
                            yield self.add_log(f"Processing... ({line_count} log lines)", "DEBUG"), None

                process.wait()
                if process.returncode != 0:
                    raise Exception(f"HTTrack exited with code {process.returncode}")
            except Exception as e:
                yield self.add_log(f"HTTrack error: {str(e)}", "ERROR"), None
                raise

            progress(0.85, desc="Download complete!")
            yield self.add_log("Website download completed successfully!", "SUCCESS"), None

            # Step 5: Create ZIP archive
            progress(0.9, desc="Creating ZIP archive...")
            yield self.add_log("Creating ZIP archive..."), None
            zip_path = os.path.join(temp_dir, f"{project_name}.zip")
            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
                file_count = 0
                for root, dirs, files in os.walk(output_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        arcname = os.path.relpath(file_path, output_dir)
                        zipf.write(file_path, arcname)
                        file_count += 1
                        if file_count % 50 == 0:
                            progress(0.9 + (0.08 * min(file_count / 500, 1)),
                                     desc=f"Archiving... ({file_count} files)")
            yield self.add_log(f"Created ZIP archive with {file_count} files"), None

            # Step 6: Complete
            progress(1.0, desc="Complete!")
            yield self.add_log("Scraping complete! Archive ready for download.", "SUCCESS"), zip_path

        except Exception as e:
            error_msg = f"Error during scraping: {str(e)}"
            yield self.add_log(error_msg, "ERROR"), None
            raise gr.Error(error_msg)
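

# Usage sketch (illustrative only): `scrape_website` is a generator that yields
# (log_text, zip_path_or_None) tuples, which is what lets the Gradio UI stream
# log updates before delivering the archive. Whether the default gr.Progress()
# works with no active Gradio event is an assumption of this sketch; inside the
# app it is always invoked from a button click.
#
#   scraper = HTTrackScraper()
#   for log_text, archive in scraper.scrape_website("https://example.com", max_depth=2):
#       print(log_text.splitlines()[-1])   # latest log line
#   # On the final iteration, `archive` holds the path to the ZIP file.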


def create_app():
    """Create and configure the Gradio interface."""
    scraper = HTTrackScraper()

    # Custom CSS for better styling
    custom_css = """
    .gradio-container {
        font-family: 'Inter', sans-serif;
    }
    .success-box {
        border-left: 4px solid #10b981;
        background-color: #f0fdf4;
        padding: 1rem;
        border-radius: 0.5rem;
    }
    .log-box {
        font-family: 'Monaco', 'Courier New', monospace;
        font-size: 0.875rem;
        line-height: 1.5;
    }
    """

    with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as app:
        gr.Markdown(
            """
            # HTTrack Website Scraper

            Download and archive websites for offline browsing, analysis, or backup.

            **Important**: Only scrape websites you have permission to download.
            Respect robots.txt and copyright laws.
            """
        )

        with gr.Row():
            with gr.Column(scale=2):
                url_input = gr.Textbox(
                    label="Website URL",
                    placeholder="https://example.com",
                    info="Enter the full URL of the website to scrape"
                )
                with gr.Row():
                    max_depth = gr.Slider(
                        minimum=1,
                        maximum=10,
                        value=3,
                        step=1,
                        label="Max Depth",
                        info="How many link levels to follow (higher = more files)"
                    )
                    max_rate = gr.Number(
                        value=1000000,
                        label="Max Rate (bytes/sec)",
                        info="0 = unlimited. Limit bandwidth usage to be respectful."
                    )
                respect_robots = gr.Checkbox(
                    value=True,
                    label="Respect robots.txt",
                    info="Honor website's robots.txt preferences"
                )
                scrape_btn = gr.Button("Start Scraping", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown(
                    """
                    ### Tips
                    - **Depth 1-2**: Quick scrape, main pages only
                    - **Depth 3-5**: Moderate scrape, most content
                    - **Depth 6+**: Deep scrape, may take hours
                    - Set max rate to avoid overloading servers
                    - Always respect robots.txt unless you own the site
                    - Large sites will take time - be patient!
                    """
                )

        with gr.Row():
            log_output = gr.Textbox(
                label="Scraping Log",
                lines=15,
                max_lines=20,
                elem_classes=["log-box"],
                interactive=False,
                show_copy_button=True
            )

        with gr.Row():
            download_output = gr.File(
                label="Download Archive",
                interactive=False
            )

        gr.Markdown(
            """
            ---
            ### How it works
            1. **Validation**: The URL is validated and cleaned
            2. **Configuration**: HTTrack is configured with your settings
            3. **Download**: The website is recursively downloaded
            4. **Archiving**: All files are packaged into a ZIP archive
            5. **Delivery**: The archive is ready for download

            ### Technical Details
            - Uses HTTrack command-line tool for robust website mirroring
            - Preserves original link structure for offline browsing
            - Downloads HTML, CSS, JavaScript, images, and other assets
            - Creates compressed ZIP archive for easy distribution
            """
        )

        # Wire up the scraping function
        scrape_btn.click(
            fn=scraper.scrape_website,
            inputs=[url_input, max_depth, max_rate, respect_robots],
            outputs=[log_output, download_output],
            show_progress=True
        )

    return app


if __name__ == "__main__":
    app = create_app()
    app.queue()  # Enable queuing for progress updates
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )
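
# Deployment note (assumption, based on the subprocess call above): this app shells
# out to the `httrack` CLI, so the binary must be installed on the host. On Debian or
# Ubuntu that is `apt-get install httrack`; on a Hugging Face Space, apt packages are
# usually requested by listing them (one per line, here just `httrack`) in a
# packages.txt file at the repository root.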