Update app.py
app.py CHANGED

@@ -30,6 +30,7 @@ import subprocess
 import pytesseract
 from pdf2image import convert_from_path
 import queue  # Added: to handle the queue.Empty exception
+import time   # Added: for streaming timing
 
 # -------------------- Added: imports for PDF to Markdown conversion --------------------
 try:
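
Note: the banner above says pytesseract and pdf2image are imported for PDF conversion, but the code that uses them sits outside this diff. A minimal sketch of how those two libraries are typically combined (assuming the poppler and tesseract binaries are installed; pdf_to_text is an illustrative name, not a function from app.py):

from pdf2image import convert_from_path
import pytesseract

def pdf_to_text(pdf_path: str, dpi: int = 200) -> str:
    # Render each PDF page to a PIL image, then OCR each page.
    pages = convert_from_path(pdf_path, dpi=dpi)
    return "\n\n".join(pytesseract.image_to_string(page) for page in pages)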

@@ -545,10 +546,15 @@ def clear_cuda_memory():
 @spaces.GPU
 def load_model():
     try:
+        # Clear memory first
+        clear_cuda_memory()
+
         loaded_model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,
             torch_dtype=torch.bfloat16,
             device_map="auto",
+            # Added setting to lower memory usage
+            low_cpu_mem_usage=True,
         )
         return loaded_model
     except Exception as e:
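
Note: load_model() now calls clear_cuda_memory() before loading, but the helper's body is outside this hunk (only its def line appears in the hunk header). A minimal sketch of what such a helper commonly does, assuming it only relies on gc and torch:

import gc
import torch

def clear_cuda_memory():
    # Drop unreferenced Python objects, then release cached CUDA blocks.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()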

@@ -628,19 +634,22 @@ def stream_chat(
     if len(history) > max_history_length:
         history = history[-max_history_length:]
 
+    # Wikipedia context search
+    wiki_context = ""
     try:
         relevant_contexts = find_relevant_context(message)
+        if relevant_contexts:  # Only add context when there are results
+            wiki_context = "\n\nRelated Wikipedia information:\n"
+            for ctx in relevant_contexts:
+                wiki_context += (
+                    f"Q: {ctx['question']}\n"
+                    f"A: {ctx['answer']}\n"
+                    f"Similarity: {ctx['similarity']:.3f}\n\n"
+                )
     except Exception as e:
         print(f"Context search error: {str(e)}")
-        wiki_context = ""
 
+    # Build the conversation history
     conversation = []
     for prompt, answer in history:
         conversation.extend([
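
Note: find_relevant_context() itself is not part of the diff; only the shape of its results (question, answer, similarity) is visible above. A hedged sketch of one common way such a lookup is implemented, with the embedding function and the Q/A corpus passed in explicitly (embed, wiki_qa and wiki_embeddings are illustrative names, not from app.py, and the real function takes only the message):

import numpy as np

def find_relevant_context(message, embed, wiki_qa, wiki_embeddings, top_k=3, threshold=0.3):
    # Cosine similarity between the query embedding and every stored Q/A embedding.
    q = embed(message)                                    # shape (d,)
    sims = wiki_embeddings @ q / (
        np.linalg.norm(wiki_embeddings, axis=1) * np.linalg.norm(q) + 1e-9
    )
    best = np.argsort(-sims)[:top_k]
    return [
        {"question": wiki_qa[i]["question"],
         "answer": wiki_qa[i]["answer"],
         "similarity": float(sims[i])}
        for i in best
        if sims[i] >= threshold
    ]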

@@ -648,43 +657,61 @@ def stream_chat(
             {"role": "assistant", "content": answer}
         ])
 
+    # Build the final message
+    final_message = message
+    if file_context:
+        final_message = file_context + "\nCurrent question: " + message
+    if wiki_context:
+        final_message = wiki_context + "\nCurrent question: " + message
+    if file_context and wiki_context:
+        final_message = file_context + wiki_context + "\nCurrent question: " + message
+
     conversation.append({"role": "user", "content": final_message})
 
+    # Build the prompt and tokenize
     input_ids_str = build_prompt(conversation)
-    inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda")
+
+    # Check and cap the context length first
     max_context = 8192
+    tokenized_input = tokenizer(input_ids_str, return_tensors="pt")
+    input_length = tokenized_input["input_ids"].shape[1]
+
+    # Truncate if the context is too long
+    if input_length > max_context - max_new_tokens:
+        print(f"Input is too long: {input_length} tokens. Truncating...")
+        # Reserve a minimum number of generation tokens
+        min_generation = min(256, max_new_tokens)
         new_desired_input_length = max_context - min_generation
+
+        # Trim the input text at the token level
+        tokens = tokenizer.encode(input_ids_str)
+        if len(tokens) > new_desired_input_length:
+            tokens = tokens[-new_desired_input_length:]
+            input_ids_str = tokenizer.decode(tokens)
+
+        # Re-tokenize
+        tokenized_input = tokenizer(input_ids_str, return_tensors="pt")
+        input_length = tokenized_input["input_ids"].shape[1]
+
+    print(f"Final input length: {input_length} tokens")
+
+    # Move the inputs to CUDA
+    inputs = tokenized_input.to("cuda")
+
+    # Compute the remaining token budget and adjust max_new_tokens
+    remaining = max_context - input_length
     if remaining < max_new_tokens:
+        print(f"Adjusting max_new_tokens: {max_new_tokens} -> {remaining}")
         max_new_tokens = remaining
 
     print(f"CUDA memory after creating input tensors: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
 
+    # Set up the streamer
     streamer = TextIteratorStreamer(
+        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
     )
 
+    # Set the generation parameters
     generate_kwargs = dict(
         **inputs,
         streamer=streamer,
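
Note: the truncation block above keeps only the most recent tokens so that prompt plus generation stays inside the 8192-token window, then re-tokenizes because decode-then-re-encode can shift token boundaries slightly. The same idea, factored into a small standalone helper for clarity (fit_to_context and the gpt2 tokenizer are illustrative, not part of app.py):

from transformers import AutoTokenizer

def fit_to_context(text, tokenizer, max_context=8192, reserve_for_generation=256):
    # Keep only the most recent tokens so prompt + generation fits the window.
    budget = max_context - reserve_for_generation
    tokens = tokenizer.encode(text)
    if len(tokens) <= budget:
        return text
    # The tail is kept: in a chat prompt the newest turns matter most.
    return tokenizer.decode(tokens[-budget:], skip_special_tokens=True)

if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained("gpt2")  # small stand-in tokenizer for the sketch
    short = fit_to_context("hello " * 5000, tok, max_context=1024)
    print(len(tok.encode(short)))  # stays within roughly the reserved budget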

@@ -694,23 +721,51 @@ def stream_chat(
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
+        eos_token_id=tokenizer.eos_token_id,  # Explicitly set the EOS token
     )
 
+    # Clear memory
     clear_cuda_memory()
 
+    # Run generation in a separate thread
     thread = Thread(target=model.generate, kwargs=generate_kwargs)
     thread.start()
 
+    # Stream the response
     buffer = ""
+    partial_message = ""
+    last_yield_time = time.time()
+
     try:
         for new_text in streamer:
             buffer += new_text
+            partial_message += new_text
+
+            # Update at a fixed interval or once enough text has accumulated
+            current_time = time.time()
+            if current_time - last_yield_time > 0.1 or len(partial_message) > 20:
+                yield "", history + [[message, buffer]]
+                partial_message = ""
+                last_yield_time = current_time
+
+        # Make sure the final response is emitted
+        if buffer:
             yield "", history + [[message, buffer]]
+
+        # Save to the conversation history
+        chat_history.add_conversation(message, buffer)
+
+    except Exception as e:
+        print(f"Error during streaming: {str(e)}")
+        if not buffer:  # Show an error message if the buffer is empty
+            buffer = f"An error occurred while generating the response: {str(e)}"
         yield "", history + [[message, buffer]]
+
+    # Wait for the generation thread to finish if it is still running
+    if thread.is_alive():
+        thread.join(timeout=5.0)
+
+    # Clear memory
     clear_cuda_memory()
 
     except Exception as e:
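
Note: the streaming section relies on the standard transformers pattern: model.generate() runs in a worker thread while TextIteratorStreamer hands decoded chunks back to the consuming thread, which the new code additionally throttles to one yield per 0.1 s or ~20 characters. A minimal self-contained sketch of that pattern ("sshleifer/tiny-gpt2" is just a small stand-in checkpoint):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
mdl = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

inputs = tok("Hello", return_tensors="pt")
streamer = TextIteratorStreamer(tok, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs in a worker thread while this thread consumes the stream.
thread = Thread(target=mdl.generate,
                kwargs=dict(**inputs, streamer=streamer, max_new_tokens=20))
thread.start()
for piece in streamer:          # yields decoded text chunks as they are produced
    print(piece, end="", flush=True)
thread.join()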

@@ -825,6 +880,10 @@ def create_demo():
         )
 
         file_upload.change(
+            fn=lambda: ("Processing...", [["System", "Analyzing the file. Please wait a moment..."]]),
+            outputs=[msg, chatbot],
+            queue=False
+        ).then(
             fn=init_msg,
             outputs=msg,
             queue=False
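
Note: the file_upload.change(...).then(...) chain above pushes an immediate "Processing..." status to the UI before the slower init_msg handler runs. A minimal sketch of that Gradio event-chaining pattern with illustrative component names (not the ones from app.py):

import time
import gradio as gr

def slow_step(text):
    time.sleep(2)                       # stands in for the real file processing
    return f"Processed: {text}"

with gr.Blocks() as demo:
    box = gr.Textbox(label="Input")
    status = gr.Textbox(label="Status")
    # Quick, unqueued status update first, then the slow handler via .then().
    box.submit(fn=lambda: "Processing...", outputs=status, queue=False).then(
        fn=slow_step, inputs=box, outputs=status
    )

if __name__ == "__main__":
    demo.launch()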

@@ -846,4 +905,4 @@ def create_demo():
 
 if __name__ == "__main__":
     demo = create_demo()
-    demo.launch()
+    demo.launch()