Spaces:

JeffMII
/

CEC-Learning

Sleeping

Jeff Myers II committed on Jun 17

Commit

297702e

1 Parent(s): 5278642

Using quantization_config instead of load_in_8bit

Files changed (1) hide show

Gemma.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import login
 import spaces
 import torch
@@ -11,8 +12,14 @@ class GemmaLLM:
     def __init__(self):
         login(token=os.environ.get("GEMMA_TOKEN"))
         model_id = "google/gemma-3-4b-it"
-        model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         self.model = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.bfloat16)

 from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+from transformers.utils import quantization_config
 from huggingface_hub import login
 import spaces
 import torch
     def __init__(self):
         login(token=os.environ.get("GEMMA_TOKEN"))
+        quant_config = quantization_config.BitsAndBytesConfig(
+            load_in_8bit=True,
+            llm_int8_threshold=6.0,
+            llm_int8_has_fp16_weight=False,
+        )
         model_id = "google/gemma-3-4b-it"
+        model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quant_config)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         self.model = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.bfloat16)