Spaces:

lvwerra
/

jupyter-gh-viewer-edu

Running

App Files Files Community

jupyter-gh-viewer-edu / app.py

lvwerra HF Staff

progress

6a942c7 over 1 year ago

raw

history blame

3.25 kB

	import gradio as gr
	from datasets import load_dataset
	import nbformat
	from nbconvert import HTMLExporter
	from traitlets.config import Config
	import os
	import shutil
	import base64

	# Scratch directory that holds the raw notebook file offered for download.
	TMP_DIR = './tmp/'

	# HTML exporter set up with the output-extraction preprocessor, so each
	# conversion also yields the extracted outputs under resources["outputs"].
	config = Config()
	config.HTMLExporter.preprocessors = [
	    "nbconvert.preprocessors.ExtractOutputPreprocessor",
	]
	html_exporter = HTMLExporter(template_name="classic", config=config)

	# Stream the dataset and keep one shared iterator; every request advances it.
	ds = load_dataset(
	    "data-agents/kaggle-notebooks",
	    streaming=True,
	    split="train",
	)
	ds_iter = iter(ds)

	def reset_tmp_folder(path=None):
	    """Delete and recreate a scratch directory so it starts out empty.

	    Args:
	        path: Directory to reset. When omitted, the module-level ``TMP_DIR``
	            is used, which keeps existing zero-argument calls working.
	    """
	    target = TMP_DIR if path is None else path
	    if os.path.exists(target):
	        shutil.rmtree(target)
	    # Always recreate, including on the first run when nothing existed yet.
	    os.makedirs(target)


	def embed_figures(html_body, resources):
	    """Inline extracted output files into the HTML as base64 data URIs.

	    Every occurrence of an output's filename in ``html_body`` is replaced by
	    a ``data:<mime>;base64,...`` URI built from that output's bytes.

	    Args:
	        html_body: HTML string that references outputs by filename.
	        resources: Mapping whose ``"outputs"`` entry maps filenames to the
	            raw bytes of each output. A missing or empty entry is treated
	            as "no outputs".

	    Returns:
	        The HTML string with all known output filenames replaced.
	    """
	    import mimetypes  # local import: only this function needs it

	    outputs = resources.get("outputs") or {}
	    for filename, payload in outputs.items():
	        # Derive the MIME type from the file extension so non-PNG outputs
	        # (e.g. .jpeg, .svg) are labelled correctly; fall back to PNG.
	        mime_type, _ = mimetypes.guess_type(filename)
	        if mime_type is None:
	            mime_type = "image/png"
	        b64_figure = base64.b64encode(payload).decode("utf-8")
	        data_uri = f"data:{mime_type};base64,{b64_figure}"
	        html_body = html_body.replace(filename, data_uri)
	    return html_body

	def _render_notebook(notebook_string):
	    """Parse a notebook JSON string and return the exporter's (body, resources) pair."""
	    notebook_parsed = nbformat.reads(notebook_string, as_version=4)
	    return html_exporter.from_notebook_node(notebook_parsed)


	def parse_notebook(filter_options, progress=gr.Progress()):
	    """Advance the dataset stream to the next notebook passing the filter and render it.

	    The raw notebook string is also written to a file in the scratch
	    directory (which is reset on every call).

	    Args:
	        filter_options: One of ``"none"``, ``">10KB"``, ``">100KB"``,
	            ``">1MB"``, ``">10MB"`` or ``"has outputs"``. Size filters compare
	            the length of the raw notebook string against the threshold.
	        progress: Gradio progress tracker, updated once per notebook pulled.

	    Returns:
	        A ``(notebook_body, out_path)`` tuple: the rendered HTML with figures
	        embedded, and the path of the saved raw notebook file.

	    Raises:
	        gr.Error: If the filter is not recognised or the stream is exhausted.
	    """
	    # Minimum length of the raw notebook string required by each size filter.
	    min_length_by_filter = {
	        ">10KB": 10 * 1024,
	        ">100KB": 100 * 1024,
	        ">1MB": 1 * 1024 * 1024,
	        ">10MB": 10 * 1024 * 1024,
	    }
	    known_filters = ("none", "has outputs", *min_length_by_filter)
	    if filter_options not in known_filters:
	        # An unrecognised value would never match any branch below and the
	        # loop would silently consume the whole stream.
	        raise gr.Error(f"Unknown notebook filter: {filter_options!r}")

	    reset_tmp_folder()

	    rendered = None
	    counter = 0
	    while True:
	        progress((counter, None), desc="Parsing and filtering notebooks...", unit="steps")
	        counter += 1

	        try:
	            notebook_data = next(ds_iter)
	        except StopIteration:
	            raise gr.Error("No more notebooks left in the dataset stream.") from None
	        notebook_string = notebook_data["text"]

	        if filter_options == "none":
	            break
	        if filter_options == "has outputs":
	            # too slow: needs a full conversion of every candidate notebook
	            candidate = _render_notebook(notebook_string)
	            if len(candidate[1]["outputs"]) > 0:
	                # Keep the result so the notebook is not converted twice.
	                rendered = candidate
	                break
	        elif len(notebook_string) > min_length_by_filter[filter_options]:
	            break

	    notebook_id = notebook_data["id"].split("/")[-1]
	    out_path = os.path.join(TMP_DIR, notebook_id)

	    # Save the notebook string to a file; explicit UTF-8 so non-ASCII
	    # content does not depend on the platform's default encoding.
	    with open(out_path, 'w', encoding="utf-8") as f:
	        f.write(notebook_string)

	    if rendered is None:
	        rendered = _render_notebook(notebook_string)
	    notebook_body, resources = rendered
	    notebook_body = embed_figures(notebook_body, resources)
	    print("Resources:", resources["outputs"].keys())
	    return notebook_body, out_path


	# Build the UI: a filter selector, a button to advance to the next notebook,
	# a download slot for the raw notebook file and the rendered HTML view.
	with gr.Blocks() as demo:
	    gr.Markdown("# Kaggle Notebooks")
	    # Bind the Radio component itself; a trailing comma after the call would
	    # turn this name into a 1-tuple instead of a component.
	    filter_options = gr.Radio(
	        ["none", ">10KB", ">100KB", ">1MB", ">10MB"],
	        value="none",
	        label="Notebook filters",
	        info="A lot of notebooks are short or have the outputs stripped - filters help finding interesting ones.",
	    )

	    button = gr.Button("Show next!")
	    file = gr.File()
	    html = gr.HTML("")

	    # Show a notebook on page load and whenever the button is clicked.
	    button.click(fn=parse_notebook, inputs=filter_options, outputs=[html, file])
	    demo.load(fn=parse_notebook, inputs=filter_options, outputs=[html, file])

	demo.launch()