| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | import gradio as gr |
| | import PyPDF2 |
| | import re |
| | import json |
| | from typing import List, Dict |
| | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
| | import torch |
| | import tempfile |
| | import os |
| |
|
# ---------------------------------------------------------------------------
# Model setup (runs once at import time).
print("Loading models... This may take a minute on first run.")

# T5-small fine-tuned for highlight-based question generation: the answer
# span is marked with <hl> tokens in the encoder input (see generate_questions).
model_name = "valhalla/t5-small-qg-hl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Inference only: disable dropout and pin to CPU (app assumes no GPU).
model.eval()
device = torch.device("cpu")
model.to(device)
| |
|
def extract_key_phrases(text: str) -> List[str]:
    """Pull out up to five answer-candidate phrases from *text*.

    Combines capitalized multi-word spans (likely proper nouns / titles)
    with phrases matched by a few definitional patterns, then deduplicates
    while preserving first-occurrence order.
    """
    found: List[str] = []

    # Capitalized runs such as "Machine Learning" make good candidates.
    found.extend(re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)[:3])

    # Phrases introduced by definitional language ("known as", "process of", ...).
    definition_patterns = (
        r'(?:process|method|technique|approach|concept|theory|principle|system) of ([^,.]{10,50})',
        r'(?:known as|called|termed|referred to as) ([^,.]{5,40})',
        r'(?:is|are|was|were) (\w+(?:\s+\w+){1,4}) (?:that|which|who)',
    )
    for pat in definition_patterns:
        found.extend(re.findall(pat, text, re.IGNORECASE)[:2])

    # Keep only reasonably long phrases; dedupe preserving order via dict keys.
    cleaned = [phrase.strip() for phrase in found if len(phrase.strip()) > 5]
    return list(dict.fromkeys(cleaned))[:5]
| |
|
def generate_questions(context: str, answer: str, question_type: str = "what", max_length: int = 128) -> str:
    """Generate a single question about *answer* from *context* with T5.

    The answer span is wrapped in <hl> markers as expected by the
    valhalla/t5-small-qg-hl checkpoint. Returns "" on failure or when
    the decoded question is too short (<= 10 chars) to be useful.

    NOTE(review): do_sample=True makes output non-deterministic, and mixing
    sampling with beam search is unusual — confirm this combination is intended.
    """
    try:
        # Highlight the answer span for the question-generation model.
        input_text = f"generate question: <hl> {answer} <hl> {context}"

        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=512,   # encoder limit; longer context is truncated
            truncation=True,
            padding=True
        ).to(device)

        # Slightly higher temperature / wider beam for why/how questions
        # to encourage more varied phrasings than plain factual "what".
        temperature = 0.7 if question_type == "what" else 0.85
        num_beams = 4 if question_type == "what" else 5

        # Inference only — no gradients needed.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True,
                do_sample=True,
                temperature=temperature
            )

        question = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Strip any leading "question:" / "q:" prefix the model may emit.
        question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()

        # Normalize punctuation/casing and steer toward the requested type.
        question = improve_question(question, answer, context, question_type)

        return question if len(question) > 10 else ""

    except Exception as e:
        # Best-effort: a failed generation yields "" and the caller skips it.
        print(f"Error generating question: {e}")
        return ""
| |
|
def improve_question(question: str, answer: str, context: str, question_type: str) -> str:
    """Normalize punctuation/casing of *question* and, for "why"/"how"
    requests, try to rewrite questions that came out in another form."""
    # Ensure the question ends with a question mark.
    if not question.endswith('?'):
        question = question.rstrip('.') + '?'

    # Capitalize the first character (no-op on empty strings).
    if question:
        question = question[0].upper() + question[1:]

    lowered = question.lower()
    if question_type == "why" and not lowered.startswith("why"):
        # Only rewrite questions that look like yes/no or factual forms.
        if re.search(r'\b(is|are|was|were|does|do|did)\b', question, re.IGNORECASE):
            question = create_why_question(question, answer, context)
    elif question_type == "how" and not lowered.startswith("how"):
        if re.search(r'\b(does|do|did|can|could)\b', question, re.IGNORECASE):
            question = create_how_question(question, answer, context)

    return question
| |
|
def create_why_question(base_question: str, answer: str, context: str) -> str:
    """Try to build a 'Why ...?' question from causal language in *context*;
    fall back to a generic form based on *answer*, else *base_question*."""
    causal_patterns = (
        r'because ([^,.]{10,60})',
        r'due to ([^,.]{10,60})',
        r'as a result of ([^,.]{10,60})',
        r'(?:leads to|causes|results in) ([^,.]{10,60})',
        r'in order to ([^,.]{10,60})',
    )

    for pat in causal_patterns:
        if re.search(pat, context, re.IGNORECASE):
            # Causal language found: anchor the question on the first
            # subject-like phrase that precedes a linking/auxiliary verb.
            subject_match = re.search(
                r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+(?:is|are|was|were|does|do)',
                context,
            )
            if subject_match:
                return f"Why does {subject_match.group(1).lower()} occur?"

    # No usable causal structure: build a generic question from the answer.
    answer_words = answer.split()
    if len(answer_words) > 3:
        return f"Why is {' '.join(answer_words[:4])}... important?"

    return base_question
| |
|
def create_how_question(base_question: str, answer: str, context: str) -> str:
    """Transform or create a 'how' question.

    Looks for process-style language in *context* ("method of ...",
    "by ...", "through ...") and builds a "How ...?" question from it;
    otherwise anchors on a process verb ("works", "operates", ...).
    Falls back to returning *base_question* unchanged.
    """
    # Process descriptions: "method of X", "by X", "through X".
    process_patterns = [
        r'(process|method|procedure|technique|approach) (?:of|for|to) ([^,.]{10,60})',
        r'by ([^,.]{10,60})',
        r'through ([^,.]{10,60})'
    ]

    for pattern in process_patterns:
        match = re.search(pattern, context, re.IGNORECASE)
        if match:
            if len(match.groups()) > 1:
                process = match.group(2)
                return f"How does {process.lower()} work?"
            else:
                process = match.group(1)
                return f"How is {process.lower()} achieved?"

    # Fall back to a "subject + process verb" question.
    verbs = re.findall(r'\b(works?|functions?|operates?|performs?|executes?)\b', context, re.IGNORECASE)
    if verbs:
        subject_match = re.search(r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+' + verbs[0], context, re.IGNORECASE)
        if subject_match:
            subject = subject_match.group(1)
            # BUG FIX: use the verb's base form so the phrasing after "does"
            # is grammatical (previously produced "How does the machine works?").
            # All matchable verbs are regular, so stripping one trailing 's'
            # yields the base form ("works" -> "work", "operates" -> "operate").
            verb = verbs[0].lower()
            if verb.endswith('s'):
                verb = verb[:-1]
            return f"How does {subject.lower()} {verb}?"

    return base_question
| |
|
def extract_text_from_pdf(pdf_file) -> str:
    """Extract all text from *pdf_file* (a filesystem path or file-like object).

    Returns the concatenated per-page text, or an "Error reading PDF: ..."
    string on failure — callers check for the "Error" prefix rather than
    catching an exception.
    """
    text = ""
    try:
        # FIX: the original branched on isinstance(pdf_file, str) but both
        # branches were identical; PyPDF2.PdfReader accepts either a path
        # string or a binary stream, so one call suffices.
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        for page in pdf_reader.pages:
            page_text = page.extract_text()
            # extract_text() may return None/"" for image-only pages.
            if page_text:
                text += page_text + "\n"
    except Exception as e:
        return f"Error reading PDF: {str(e)}"

    return text
| |
|
def clean_text(text: str) -> str:
    """Collapse whitespace and drop unusual symbols from extracted PDF text."""
    # Normalize all runs of whitespace (newlines, tabs, ...) to single spaces.
    collapsed = re.sub(r'\s+', ' ', text)
    # Keep only word characters, whitespace and basic punctuation.
    printable = re.sub(r'[^\w\s.,;!?-]', '', collapsed)
    return printable.strip()
| |
|
def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """Split *text* into sentence-aligned chunks of roughly *max_chunk_size*
    characters, then prepend a small tail of the previous chunk to every
    chunk after the first for context continuity.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Greedily pack whole sentences into chunks.
    base_chunks: List[str] = []
    buffer = ""
    for sentence in sentences:
        if len(buffer) + len(sentence) < max_chunk_size:
            buffer += " " + sentence
        else:
            if buffer:
                base_chunks.append(buffer.strip())
            buffer = sentence
    if buffer:
        base_chunks.append(buffer.strip())

    # Prefix each later chunk with the tail of its (un-overlapped) predecessor.
    result: List[str] = []
    for i, chunk in enumerate(base_chunks):
        if i > 0 and overlap > 0:
            prev_parts = base_chunks[i - 1].split('. ')
            if len(prev_parts) > 1:
                tail = '. '.join(prev_parts[-2:])      # last two sentences
            else:
                tail = base_chunks[i - 1][-overlap:]   # last *overlap* characters
            chunk = tail + " " + chunk
        result.append(chunk)

    return result
| |
|
def generate_qa_pairs(chunk: str, num_questions: int = 3) -> List[Dict[str, str]]:
    """Build up to *num_questions* flashcards from one text chunk,
    cycling through what/why/how question types for variety.

    Returns [] for chunks under 20 words or when no candidates are found.
    """
    cards: List[Dict[str, str]] = []

    # Skip fragments too short to yield meaningful questions.
    if len(chunk.split()) < 20:
        return []

    try:
        # Candidate answers: extracted key phrases plus the first two
        # reasonably long sentences of the chunk.
        phrases = extract_key_phrases(chunk)
        long_sentences = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
        candidates = phrases + long_sentences[:2]

        if not candidates:
            return []

        # Rotate through question types so cards are varied.
        type_cycle = ["what", "why", "how"]

        made = 0
        for idx, answer in enumerate(candidates):
            if made >= num_questions:
                break

            # Very short answers rarely produce good questions.
            if len(answer.split()) < 3:
                continue

            q_type = type_cycle[idx % len(type_cycle)]
            question = generate_questions(chunk, answer, question_type=q_type)

            # Reject empty questions and degenerate Q == A pairs.
            if question and question != answer:
                cards.append({
                    "question": question,
                    "answer": answer,
                    "context": chunk[:200] + "..." if len(chunk) > 200 else chunk,
                    "type": q_type
                })
                made += 1

    except Exception as e:
        print(f"Error generating QA: {e}")

    return cards
| |
|
def process_pdf(pdf_file, questions_per_chunk: int = 3, max_chunks: int = 20):
    """Gradio generator: turn an uploaded PDF into flashcards.

    Yields (status, csv_text, json_text, display_markdown) tuples so the
    UI can show progress; the final yield carries the full results.
    """
    if pdf_file is None:
        # BUG FIX: this function is a generator (it contains yield), so a
        # bare `return <tuple>` is swallowed as the StopIteration value and
        # never reaches the UI — the message must be yielded instead.
        yield "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
        return

    try:
        # 1. Extract raw text.
        yield "Extracting text from PDF...", "", "", "Processing..."
        raw_text = extract_text_from_pdf(pdf_file)

        # extract_text_from_pdf reports failure via an "Error..." string.
        if raw_text.startswith("Error"):
            yield raw_text, "", "", "Error occurred"
            return

        if len(raw_text.strip()) < 100:
            yield "PDF appears to be empty or contains no extractable text.", "", "", "Error occurred"
            return

        # 2. Clean.
        yield "Cleaning text...", "", "", "Processing..."
        cleaned_text = clean_text(raw_text)

        # 3. Chunk, capped at max_chunks sections to bound CPU time.
        yield "Chunking text into sections...", "", "", "Processing..."
        chunks = chunk_text(cleaned_text)[:max_chunks]

        # 4. Generate flashcards per chunk, streaming progress updates.
        all_flashcards = []
        total_chunks = len(chunks)
        for i, chunk in enumerate(chunks):
            yield (f"Generating flashcards... ({i+1}/{total_chunks} chunks processed)",
                   "", "", "Processing...")
            all_flashcards.extend(generate_qa_pairs(chunk, questions_per_chunk))

        if not all_flashcards:
            yield ("Could not generate flashcards from this PDF. Try a PDF with more textual content.",
                   "", "", "No flashcards generated")
            return

        yield "Finalizing...", "", "", "Almost done..."

        # 5. Build the three output formats.
        display_text = format_flashcards_display(all_flashcards)
        json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)

        # Minimal CSV for Anki import; embedded quotes doubled per RFC 4180.
        csv_lines = ["Question,Answer,Type"]
        for card in all_flashcards:
            q = card['question'].replace('"', '""')
            a = card['answer'].replace('"', '""')
            t = card.get('type', 'what')
            csv_lines.append(f'"{q}","{a}","{t}"')
        csv_output = "\n".join(csv_lines)

        # Status line with per-type counts, e.g. "(4 what, 3 why, 2 how)".
        types_count = {}
        for card in all_flashcards:
            t = card.get('type', 'what')
            types_count[t] = types_count.get(t, 0) + 1
        breakdown = ", ".join(f"{count} {qtype}" for qtype, count in types_count.items())
        stats = f"Done! Generated {len(all_flashcards)} flashcards ({breakdown})"

        yield stats, csv_output, json_output, display_text

    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}"
        print(error_msg)
        yield error_msg, "", "", error_msg
|
def format_flashcards_display(flashcards: List[Dict]) -> str:
    """Render *flashcards* as a markdown document with a type breakdown
    header followed by one section per card."""
    out = [f"## π΄ Generated {len(flashcards)} Flashcards\n"]

    # Tally question types for the breakdown line.
    tally: Dict[str, int] = {}
    for card in flashcards:
        kind = card.get('type', 'what')
        tally[kind] = tally.get(kind, 0) + 1

    breakdown = ', '.join(f'{count} {qtype.upper()}' for qtype, count in tally.items())
    out.append(f"**Breakdown:** {breakdown}\n")
    out.append("---\n")

    for idx, card in enumerate(flashcards, 1):
        qtype = card.get('type', 'what').upper()
        emoji = "β" if qtype == "WHAT" else "π€" if qtype == "WHY" else "π§"

        out.append(f"### {emoji} Card {idx} - {qtype}")
        out.append(f"**Q:** {card['question']}")
        out.append(f"**A:** {card['answer']}")
        out.append(f"*Context: {card['context'][:100]}...*\n")
        out.append("---\n")

    return "\n".join(out)
| |
|
def create_sample_flashcard():
    """Return a rendered demo set of three flashcards (what/why/how)."""
    def _card(question, answer, context, qtype):
        # Small local helper keeps the sample data compact.
        return {"question": question, "answer": answer, "context": context, "type": qtype}

    samples = [
        _card(
            "What is photosynthesis?",
            "Photosynthesis is the process by which plants convert sunlight into energy.",
            "Photosynthesis is the process by which plants convert sunlight into energy...",
            "what",
        ),
        _card(
            "Why do plants need chlorophyll?",
            "Chlorophyll absorbs light energy needed for photosynthesis.",
            "Chlorophyll absorbs light energy needed for photosynthesis...",
            "why",
        ),
        _card(
            "How do plants convert light into chemical energy?",
            "Through the process of photosynthesis in the chloroplasts.",
            "Through the process of photosynthesis in the chloroplasts...",
            "how",
        ),
    ]
    return format_flashcards_display(samples)
| |
|
# CSS injected into the Gradio Blocks app (passed via gr.Blocks(css=...)).
custom_css = """
.flashcard-container {
border: 2px solid #e0e0e0;
border-radius: 10px;
padding: 20px;
margin: 10px 0;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
}
.question {
font-size: 1.2em;
font-weight: bold;
margin-bottom: 10px;
}
.answer {
font-size: 1em;
opacity: 0.9;
}
"""
| |
|
# ---------------------------------------------------------------------------
# Gradio UI definition. Widgets are declared inside the Blocks context and
# wired to process_pdf (a generator, so status updates stream to the UI).
with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
    gr.Markdown("""
    # π PDF to Flashcards Generator (Enhanced)

    Upload any PDF document and automatically generate study flashcards with **What, Why, and How** questions using AI.

    **β¨ New Features:**
    - π― Generates **What** questions (factual)
    - π€ Generates **Why** questions (reasoning)
    - π§ Generates **How** questions (process)
    - π Improved question quality and variety
    - π§ Better answer extraction

    **Core Features:**
    - π§ Uses local CPU-friendly AI (no GPU needed)
    - π Extracts text from any PDF
    - βοΈ Intelligently chunks content
    - π΄ Generates diverse question-answer pairs
    - πΎ Export to CSV (Anki-compatible) or JSON

    *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
    """)

    with gr.Row():
        # Left column: inputs and controls.
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="filepath"
            )

            with gr.Row():
                # These two sliders map to process_pdf's keyword parameters.
                questions_per_chunk = gr.Slider(
                    minimum=1,
                    maximum=6,
                    value=3,
                    step=1,
                    label="Questions per section"
                )
                max_chunks = gr.Slider(
                    minimum=5,
                    maximum=50,
                    value=20,
                    step=5,
                    label="Max sections to process"
                )

            process_btn = gr.Button("π Generate Flashcards", variant="primary")

            gr.Markdown("""
            ### π‘ Tips:
            - Text-based PDFs work best (scanned images won't work)
            - Academic papers and articles work great
            - Adjust "Questions per section" for more variety
            - Higher questions per section = more Why/How questions
            """)

        # Right column: live status plus the rendered flashcards.
        with gr.Column(scale=2):
            status_text = gr.Textbox(
                label="Status",
                value="Ready to process PDF...",
                interactive=False
            )

            output_display = gr.Markdown(
                label="Generated Flashcards",
                value="Your flashcards will appear here..."
            )

    # Export panes: raw CSV (Anki) and JSON side by side.
    with gr.Row():
        with gr.Column():
            csv_output = gr.Textbox(
                label="CSV Format (for Anki import)",
                lines=10,
                visible=True
            )
            gr.Markdown("*Copy the CSV content and save as `.csv` file to import into Anki*")

        with gr.Column():
            json_output = gr.Textbox(
                label="JSON Format",
                lines=10,
                visible=True
            )
            gr.Markdown("*Raw JSON data for custom applications*")

    # Wire the button: process_pdf yields tuples matching the four outputs.
    process_btn.click(
        fn=process_pdf,
        inputs=[pdf_input, questions_per_chunk, max_chunks],
        outputs=[status_text, csv_output, json_output, output_display]
    )

    # Static example rendered once at app-build time.
    gr.Markdown("---")
    gr.Markdown("### π― Example Output Format")
    gr.Markdown(create_sample_flashcard())
| |
|
if __name__ == "__main__":
    # Launch the Gradio server (blocking call).
    demo.launch()