Spaces:
Running
Running
| import gradio as gr | |
| from datasets import load_dataset | |
| import nbformat | |
| from nbconvert import HTMLExporter | |
| from traitlets.config import Config | |
| import os | |
| import shutil | |
| import base64 | |
# Scratch directory where the currently displayed notebook is written so it
# can be offered for download.
TMP_DIR = './tmp/'

# nbconvert setup: the ExtractOutputPreprocessor is enabled so that the
# exporter returns extracted outputs under resources["outputs"], which the
# rest of this file reads.
config = Config()
config.HTMLExporter.preprocessors = [
    "nbconvert.preprocessors.ExtractOutputPreprocessor",
]
html_exporter = HTMLExporter(template_name="classic", config=config)

# The dataset is opened in streaming mode and consumed through one shared,
# module-level iterator; every call to next(ds_iter) advances it.
ds = load_dataset("data-agents/kaggle-notebooks", streaming=True, split="train")
ds_iter = iter(ds)
def reset_tmp_folder():
    """Recreate ``TMP_DIR`` from scratch.

    If the path already exists it is removed together with its contents,
    then a fresh directory is created in its place.
    """
    already_present = os.path.exists(TMP_DIR)
    if already_present:
        shutil.rmtree(TMP_DIR)
    os.makedirs(TMP_DIR)
def embed_figures(html_body, resources):
    """Inline extracted notebook outputs into the HTML as base64 data URIs.

    Every occurrence of an output's filename (a key of
    ``resources["outputs"]``) in ``html_body`` is replaced by a ``data:`` URI
    carrying that output's bytes.

    Args:
        html_body: HTML string that references outputs by filename.
        resources: Mapping that may contain an ``"outputs"`` dict of
            ``filename -> bytes``.

    Returns:
        The HTML string with all output references replaced by data URIs.
        Returned unchanged when there are no outputs.
    """
    import mimetypes

    for filename, payload in resources.get('outputs', {}).items():
        # Derive the MIME type from the filename so non-PNG outputs (e.g.
        # .svg, .jpeg) are labelled correctly; fall back to PNG, which is
        # what was previously assumed for every output.
        mime_type = mimetypes.guess_type(filename)[0] or 'image/png'
        b64_figure = base64.b64encode(payload).decode('utf-8')
        data_uri = f'data:{mime_type};base64,{b64_figure}'
        html_body = html_body.replace(filename, data_uri)
    return html_body
def parse_notebook(filter_options, progress=gr.Progress()):
    """Return the next streamed notebook that passes the selected filter.

    Notebooks are pulled one at a time from the shared ``ds_iter`` until one
    satisfies ``filter_options``. The match is written to ``TMP_DIR`` and
    rendered to HTML with its extracted outputs embedded inline.

    Args:
        filter_options: One of ``"none"``, ``">10KB"``, ``">100KB"``,
            ``">1MB"``, ``">10MB"`` or ``"has outputs"``. Size filters
            compare ``len()`` of the notebook text against the threshold.
        progress: Gradio progress tracker, updated once per notebook
            inspected.

    Returns:
        A ``(html, path)`` tuple: the rendered notebook HTML and the path of
        the saved notebook file.

    Raises:
        gr.Error: If ``filter_options`` is not a known filter, or the
            dataset stream ends before a matching notebook is found.
    """
    # Minimum notebook text length required by each size filter.
    size_thresholds = {
        ">10MB": 10 * 1024 * 1024,
        ">1MB": 1 * 1024 * 1024,
        ">100KB": 100 * 1024,
        ">10KB": 10 * 1024,
    }
    known_filters = {"none", "has outputs", *size_thresholds}
    if filter_options not in known_filters:
        # An unknown filter can never match; fail fast instead of silently
        # consuming the entire stream.
        raise gr.Error(f"Unknown notebook filter: {filter_options!r}")

    reset_tmp_folder()

    rendered = None  # (body, resources) if conversion already ran in the loop
    counter = 0
    while True:
        progress((counter, None), desc="Parsing and filtering notebooks...", unit="steps")
        counter += 1
        try:
            notebook_data = next(ds_iter)
        except StopIteration:
            raise gr.Error(
                "Reached the end of the dataset without finding a matching notebook."
            ) from None
        notebook_string = notebook_data["text"]

        if filter_options == "none":
            break
        if filter_options in size_thresholds:
            if len(notebook_string) > size_thresholds[filter_options]:
                break
        else:
            # "has outputs": slow, because every candidate has to be fully
            # converted before its extracted outputs can be inspected.
            notebook_parsed = nbformat.reads(notebook_string, as_version=4)
            candidate = html_exporter.from_notebook_node(notebook_parsed)
            if len(candidate[1]["outputs"]) > 0:
                # Keep the result so the match is not converted a second time.
                rendered = candidate
                break

    notebook_id = notebook_data["id"].split("/")[-1]
    out_path = os.path.join(TMP_DIR, notebook_id)

    # Save the notebook string to a file. The encoding is explicit so the
    # write does not depend on the platform's default locale.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(notebook_string)

    if rendered is None:
        notebook_parsed = nbformat.reads(notebook_string, as_version=4)
        rendered = html_exporter.from_notebook_node(notebook_parsed)
    (notebook_body, resources) = rendered

    notebook_body = embed_figures(notebook_body, resources)
    print("Resources:", resources["outputs"].keys())
    return notebook_body, out_path
# Gradio UI: a filter selector, a "next" button, a download slot for the
# saved notebook file and an HTML pane for the rendered notebook.
with gr.Blocks() as demo:
    gr.Markdown("# Kaggle Notebooks")
    # NOTE: there must be no trailing comma after this call; with one,
    # `filter_options` becomes a 1-tuple rather than the Radio component
    # that is passed as `inputs=` below.
    filter_options = gr.Radio(
        ["none", ">10KB", ">100KB", ">1MB", ">10MB"],
        value="none",
        label="Notebook filters",
        info="A lot of notebooks are short or have the outputs stripped - filters help finding interesting ones.",
    )
    button = gr.Button("Show next!")
    file = gr.File()
    html = gr.HTML("")

    # The same handler runs on button click and on initial page load.
    button.click(fn=parse_notebook, inputs=filter_options, outputs=[html, file])
    demo.load(fn=parse_notebook, inputs=filter_options, outputs=[html, file])

demo.launch()