Spaces:
Runtime error
Runtime error
| import os | |
| import shutil | |
| import subprocess | |
| import tempfile | |
| from loguru import logger | |
| from llm_engineering.domain.documents import RepositoryDocument | |
| from .base import BaseCrawler | |
| class GithubCrawler(BaseCrawler): | |
| model = RepositoryDocument | |
| def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None: | |
| super().__init__() | |
| self._ignore = ignore | |
| def extract(self, link: str, **kwargs) -> None: | |
| old_model = self.model.find(link=link) | |
| if old_model is not None: | |
| logger.info(f"Repository already exists in the database: {link}") | |
| return | |
| logger.info(f"Starting scrapping GitHub repository: {link}") | |
| repo_name = link.rstrip("/").split("/")[-1] | |
| local_temp = tempfile.mkdtemp() | |
| try: | |
| os.chdir(local_temp) | |
| subprocess.run(["git", "clone", link]) | |
| repo_path = os.path.join(local_temp, os.listdir(local_temp)[0]) # noqa: PTH118 | |
| tree = {} | |
| for root, _, files in os.walk(repo_path): | |
| dir = root.replace(repo_path, "").lstrip("/") | |
| if dir.startswith(self._ignore): | |
| continue | |
| for file in files: | |
| if file.endswith(self._ignore): | |
| continue | |
| file_path = os.path.join(dir, file) # noqa: PTH118 | |
| with open(os.path.join(root, file), "r", errors="ignore") as f: # noqa: PTH123, PTH118 | |
| tree[file_path] = f.read().replace(" ", "") | |
| user = kwargs["user"] | |
| instance = self.model( | |
| content=tree, | |
| name=repo_name, | |
| link=link, | |
| platform="github", | |
| author_id=user.id, | |
| author_full_name=user.full_name, | |
| ) | |
| instance.save() | |
| except Exception: | |
| raise | |
| finally: | |
| shutil.rmtree(local_temp) | |
| logger.info(f"Finished scrapping GitHub repository: {link}") | |