Spaces:
Running
Running
File size: 3,250 Bytes
7924d77 1f9efe8 3bd4bd2 415d76d 52cb009 3bd4bd2 415d76d 3bd4bd2 7924d77 1f9efe8 415d76d 52cb009 6a942c7 415d76d 52cb009 6a942c7 52cb009 6a942c7 52cb009 ac98601 52cb009 9223ca1 ac98601 9223ca1 ac98601 9223ca1 1decac6 52cb009 a386c0e 415d76d a386c0e 415d76d 1f9efe8 52cb009 a386c0e 1f9efe8 9223ca1 ac98601 52cb009 a386c0e 415d76d a863009 415d76d 1f9efe8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
import base64
import mimetypes
import os
import shutil

import gradio as gr
import nbformat
from datasets import load_dataset
from nbconvert import HTMLExporter
from traitlets.config import Config
# Scratch directory, relative to the process's working directory.
TMP_DIR = './tmp/'

# Exporter settings: run nbconvert's ExtractOutputPreprocessor during export.
# The rest of this file reads the extracted outputs from resources["outputs"].
config = Config(
    {
        "HTMLExporter": {
            "preprocessors": ["nbconvert.preprocessors.ExtractOutputPreprocessor"],
        },
    }
)
html_exporter = HTMLExporter(template_name="classic", config=config)

# Stream the training split and keep one module-level iterator over it.
ds = load_dataset("data-agents/kaggle-notebooks", streaming=True, split="train")
ds_iter = iter(ds)
def reset_tmp_folder():
    """Recreate ``TMP_DIR`` as an empty directory.

    If something already exists at ``TMP_DIR`` the whole tree is removed
    first; the directory is then created again.
    """
    stale_dir_present = os.path.exists(TMP_DIR)
    if stale_dir_present:
        # Drop whatever a previous call left behind.
        shutil.rmtree(TMP_DIR)
    os.makedirs(TMP_DIR)
def embed_figures(html_body, resources):
    """Inline extracted notebook outputs into the HTML as base64 data URIs.

    Args:
        html_body: HTML text that refers to each extracted output by its key
            in ``resources['outputs']``.
        resources: Mapping whose ``'outputs'`` entry maps an output key to the
            raw bytes of that output. Presumably the keys are the file names
            generated by nbconvert's ExtractOutputPreprocessor (for example
            ``output_3_0.png``) -- confirm against the exporter config.

    Returns:
        ``html_body`` with every occurrence of each key replaced by a
        ``data:<mime>;base64,...`` URI holding that output's bytes.
    """
    for key, value in resources['outputs'].items():
        # Take the MIME type from the key's extension so non-PNG outputs
        # (e.g. .jpeg, .svg) are labelled correctly. Keys without a
        # recognisable extension keep the previous PNG label.
        mime_type = mimetypes.guess_type(key)[0] or 'image/png'
        b64_figure = base64.b64encode(value).decode('utf-8')
        data_uri = f'data:{mime_type};base64,{b64_figure}'
        html_body = html_body.replace(key, data_uri)
    return html_body
# Minimum length of the raw notebook text (as measured by ``len``) that each
# size-based filter option requires.
_MIN_LENGTH_BY_FILTER = {
    ">10KB": 10 * 1024,
    ">100KB": 100 * 1024,
    ">1MB": 1 * 1024 * 1024,
    ">10MB": 10 * 1024 * 1024,
}

# Every filter value parse_notebook understands.
_KNOWN_FILTERS = ("none", "has outputs", *_MIN_LENGTH_BY_FILTER)


def _render_notebook(notebook_string):
    """Parse raw notebook text and export it; returns ``(body, resources)``."""
    notebook_parsed = nbformat.reads(notebook_string, as_version=4)
    return html_exporter.from_notebook_node(notebook_parsed)


def parse_notebook(filter_options, progress=gr.Progress()):
    """Fetch the next notebook that passes the filter and render it as HTML.

    Records are pulled from the module-level ``ds_iter`` until one satisfies
    ``filter_options``. The matching notebook's raw text is written into a
    freshly reset ``TMP_DIR`` and its HTML export, with extracted outputs
    inlined as data URIs, is returned.

    Args:
        filter_options: One of ``"none"``, ``">10KB"``, ``">100KB"``,
            ``">1MB"``, ``">10MB"`` or ``"has outputs"``. The size filters
            compare ``len`` of the notebook text against the threshold.
        progress: Gradio progress tracker, updated once per inspected record.

    Returns:
        A ``(notebook_body, out_path)`` tuple: the rendered HTML and the path
        of the saved notebook file.

    Raises:
        ValueError: If ``filter_options`` is not a recognised filter. Without
            this check an unknown value would scan the stream without ever
            matching.
        gr.Error: If the stream runs out before a matching notebook is found.
    """
    if filter_options not in _KNOWN_FILTERS:
        raise ValueError(f"Unknown notebook filter: {filter_options!r}")

    reset_tmp_folder()

    counter = 0
    while True:
        progress((counter, None), desc="Parsing and filtering notebooks...", unit="steps")
        counter += 1
        try:
            notebook_data = next(ds_iter)
        except StopIteration as exc:
            # Report exhaustion as a readable error instead of letting a bare
            # StopIteration escape.
            raise gr.Error("No more notebooks match the selected filter.") from exc
        notebook_string = notebook_data["text"]
        rendered = None

        if filter_options == "none":
            break
        if filter_options == "has outputs":
            # Slow: every candidate has to be fully parsed and exported.
            rendered = _render_notebook(notebook_string)
            if len(rendered[1]["outputs"]) > 0:
                break
        elif len(notebook_string) > _MIN_LENGTH_BY_FILTER[filter_options]:
            break

    notebook_id = notebook_data["id"].split("/")[-1]
    out_path = os.path.join(TMP_DIR, notebook_id)
    # Save the notebook string to a file. The encoding is explicit so the
    # result does not depend on the platform's default text encoding.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(notebook_string)

    # The "has outputs" filter already rendered this notebook; reuse that
    # result rather than parsing and exporting it a second time.
    if rendered is None:
        rendered = _render_notebook(notebook_string)
    notebook_body, resources = rendered
    notebook_body = embed_figures(notebook_body, resources)
    print("Resources:", resources["outputs"].keys())
    return notebook_body, out_path
# UI: a filter selector, a "next" button, a download slot for the raw notebook
# file and an HTML pane for the rendered notebook.
with gr.Blocks() as demo:
    gr.Markdown("# Kaggle Notebooks")
    # No trailing comma after this call: with one, the name would be bound to
    # a 1-tuple rather than the Radio component passed as `inputs` below.
    filter_options = gr.Radio(
        ["none", ">10KB", ">100KB", ">1MB", ">10MB"],
        value="none",
        label="Notebook filters",
        info="A lot of notebooks are short or have the outputs stripped - filters help finding interesting ones.",
    )
    button = gr.Button("Show next!")
    file = gr.File()
    html = gr.HTML("")
    # The same handler runs on button click and on initial page load.
    button.click(fn=parse_notebook, inputs=filter_options, outputs=[html, file])
    demo.load(fn=parse_notebook, inputs=filter_options, outputs=[html, file])

demo.launch()