Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| from langchain.tools import Tool | |
def scrape_webpage(url: str, *, max_chars: int = 3000) -> str:
    """Fetch a webpage and return its visible text, truncated for LLM use.

    This function never raises. Any failure (unusable URL, network error,
    HTTP error status, parsing problem) is reported as a string of the form
    ``"[WebScraper error: ...]"`` so that the calling agent receives the
    failure as an ordinary tool result.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored.
        max_chars: Maximum number of characters of extracted text to return.
            Negative values are treated as 0. Defaults to 3000.

    Returns:
        The page text with ``<script>`` and ``<style>`` elements removed and
        the remaining text fragments joined by single spaces, cut to at most
        ``max_chars`` characters; or an error string as described above.
    """
    # Reject unusable input up front: no request is attempted and the caller
    # gets a clear message instead of a library-internal one.
    if not isinstance(url, str) or not url.strip():
        return "[WebScraper error: URL must be a non-empty string]"
    url = url.strip()

    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; AI-Agent/1.0)",
        }
        response = requests.get(url, headers=headers, timeout=10)
        # Turn 4xx/5xx responses into exceptions so they are reported as errors.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")
        # Drop elements whose content is code/styling rather than readable text.
        for tag in soup(["script", "style"]):
            tag.decompose()

        text = " ".join(soup.stripped_strings)
        # Limit the output length to avoid overloading the LLM.
        return text[:max(max_chars, 0)]
    except Exception as e:
        # Deliberately broad: this is the tool boundary, and the contract is
        # to hand every failure back to the agent as text rather than raise.
        return f"[WebScraper error: {e}]"
# Expose scrape_webpage to LangChain agents as a tool named "web_scraper".
web_scraper_tool = Tool.from_function(
    func=scrape_webpage,
    name="web_scraper",
    description="Fetches and extracts main text content from a webpage using its URL.",
)