File size: 3,250 Bytes
7924d77
1f9efe8
 
 
3bd4bd2
415d76d
 
52cb009
3bd4bd2
415d76d
3bd4bd2
 
 
7924d77
1f9efe8
 
 
415d76d
 
 
 
 
 
 
52cb009
 
 
 
 
 
 
 
6a942c7
415d76d
52cb009
6a942c7
 
52cb009
6a942c7
 
 
52cb009
 
 
 
ac98601
52cb009
9223ca1
 
 
ac98601
 
 
9223ca1
 
 
 
 
 
 
ac98601
 
 
9223ca1
1decac6
52cb009
a386c0e
415d76d
 
a386c0e
415d76d
 
1f9efe8
 
52cb009
 
a386c0e
1f9efe8
 
 
 
9223ca1
ac98601
52cb009
 
 
a386c0e
415d76d
a863009
 
415d76d
1f9efe8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import base64
import mimetypes
import os
import shutil

import gradio as gr
import nbformat
from datasets import load_dataset
from nbconvert import HTMLExporter
from traitlets.config import Config

# Scratch directory that receives the notebook file offered for download.
TMP_DIR = './tmp/'

# HTML exporter setup: the ExtractOutputPreprocessor is enabled so that the
# exporter's resources carry the extracted cell outputs under "outputs".
config = Config(
    {
        "HTMLExporter": {
            "preprocessors": ["nbconvert.preprocessors.ExtractOutputPreprocessor"],
        },
    }
)
html_exporter = HTMLExporter(template_name="classic", config=config)

# The dataset is streamed; a single module-level iterator is shared by every
# request, so each call to parse_notebook continues where the last one stopped.
ds = load_dataset("data-agents/kaggle-notebooks", streaming=True, split="train")
ds_iter = iter(ds)

def reset_tmp_folder():
    """Recreate ``TMP_DIR`` as an empty directory.

    If something already exists at ``TMP_DIR`` the whole tree is deleted
    first, so files written there earlier are discarded.
    """
    stale_dir_present = os.path.exists(TMP_DIR)
    if stale_dir_present:
        # Drop the previous contents wholesale instead of file by file.
        shutil.rmtree(TMP_DIR)
    os.makedirs(TMP_DIR)


def embed_figures(html_body, resources):
    """Inline extracted notebook outputs into the HTML as base64 data URIs.

    Every occurrence of an output key in ``html_body`` is replaced by a
    ``data:<mime>;base64,<payload>`` URI built from that output's content.

    Args:
        html_body: HTML text that refers to the extracted outputs by key
            (presumably file names such as ``output_0_0.png``; confirm
            against the exporter in use).
        resources: Mapping with an optional ``'outputs'`` entry that maps
            each output key to its raw content (``bytes``, or ``str`` which
            is encoded as UTF-8).

    Returns:
        ``html_body`` with each output key replaced by its data URI. The
        input is returned unchanged when there are no outputs.
    """
    # A missing or empty 'outputs' entry simply means nothing to embed.
    outputs = resources.get('outputs') or {}
    for key, value in outputs.items():
        if isinstance(value, str):
            # b64encode needs bytes; text payloads (e.g. SVG markup) are
            # encoded first instead of raising TypeError.
            value = value.encode('utf-8')
        # Label the payload by the key's file extension so that non-PNG
        # outputs (JPEG, SVG, ...) get a matching MIME type. PNG remains the
        # fallback for keys without a recognisable extension.
        mime_type = mimetypes.guess_type(key)[0] or 'image/png'
        b64_figure = base64.b64encode(value).decode('ascii')
        html_body = html_body.replace(key, f'data:{mime_type};base64,{b64_figure}')
    return html_body

# Minimum length of the raw notebook string (in characters, as measured by
# len()) that a notebook must exceed to pass each size filter.
_SIZE_FILTER_MIN_CHARS = {
    ">10KB": 10 * 1024,
    ">100KB": 100 * 1024,
    ">1MB": 1 * 1024 * 1024,
    ">10MB": 10 * 1024 * 1024,
}
_KNOWN_FILTERS = frozenset({"none", "has outputs", *_SIZE_FILTER_MIN_CHARS})


def _render_notebook(notebook_string):
    """Parse a raw notebook string and return the exporter's (body, resources)."""
    notebook_parsed = nbformat.reads(notebook_string, as_version=4)
    return html_exporter.from_notebook_node(notebook_parsed)


def parse_notebook(filter_options, progress=gr.Progress()):
    """Fetch the next notebook that passes the filter and render it as HTML.

    Notebooks are pulled from the shared ``ds_iter`` stream until one
    satisfies ``filter_options``. The accepted notebook is written into a
    freshly reset ``TMP_DIR`` and converted to HTML with its extracted
    outputs embedded inline.

    Args:
        filter_options: One of ``"none"``, ``">10KB"``, ``">100KB"``,
            ``">1MB"``, ``">10MB"`` or ``"has outputs"``. Size filters compare
            ``len()`` of the raw notebook string against the threshold.
        progress: Gradio progress tracker, updated once per inspected
            notebook.

    Returns:
        A tuple ``(notebook_body, out_path)``: the rendered HTML and the path
        of the saved notebook file.

    Raises:
        ValueError: If ``filter_options`` is not a known filter. Without this
            check an unknown value would consume the whole stream without
            ever matching.
        gr.Error: If the dataset stream is exhausted before a notebook
            matches.
    """
    if filter_options not in _KNOWN_FILTERS:
        raise ValueError(f"Unknown notebook filter: {filter_options!r}")

    reset_tmp_folder()
    # None for "none" and "has outputs", which are not size based.
    min_chars = _SIZE_FILTER_MIN_CHARS.get(filter_options)

    counter = 0
    rendered = None
    while True:
        progress((counter, None), desc="Parsing and filtering notebooks...", unit="steps")
        counter += 1

        try:
            notebook_data = next(ds_iter)
        except StopIteration:
            # Surface a readable message instead of leaking StopIteration out
            # of the event handler.
            raise gr.Error("No more notebooks available for the selected filter.") from None

        notebook_string = notebook_data["text"]

        if filter_options == "has outputs":
            # too slow: every candidate has to be fully converted to find out
            # whether it carries any outputs.
            rendered = _render_notebook(notebook_string)
            if rendered[1]["outputs"]:
                break
            rendered = None
        elif min_chars is None or len(notebook_string) > min_chars:
            break

    notebook_id = notebook_data["id"].split("/")[-1]
    out_path = os.path.join(TMP_DIR, notebook_id)

    # Save the notebook string to a file. UTF-8 is set explicitly so the
    # write does not depend on the platform's default encoding.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(notebook_string)

    # Reuse the conversion already done by the "has outputs" filter, if any.
    if rendered is None:
        rendered = _render_notebook(notebook_string)
    notebook_body, resources = rendered
    notebook_body = embed_figures(notebook_body, resources)
    print("Resources:", resources["outputs"].keys())
    return notebook_body, out_path


# UI: a filter selector, a button to advance to the next notebook, a download
# slot for the raw notebook file and an HTML pane with the rendered notebook.
with gr.Blocks() as demo:
    gr.Markdown("# Kaggle Notebooks")
    # No trailing comma after the call: a stray comma here would bind
    # `filter_options` to a 1-tuple instead of the Radio component that the
    # event listeners below take as their input.
    filter_options = gr.Radio(
        ["none", ">10KB", ">100KB", ">1MB", ">10MB"],
        value="none",
        label="Notebook filters",
        info="A lot of notebooks are short or have the outputs stripped - filters help finding interesting ones.",
    )

    button = gr.Button("Show next!")
    file = gr.File()
    html = gr.HTML("")

    # Show a notebook on button press and once when the page first loads.
    button.click(fn=parse_notebook, inputs=filter_options, outputs=[html, file])
    demo.load(fn=parse_notebook, inputs=filter_options, outputs=[html, file])

demo.launch()