Embedding Models
Collection
5 items
•
Updated
We introduce Elbedding, TBD
For more technical details, refer to our paper: TBD
from typing import List
from transformers import AutoTokenizer, AutoModel
import torch
def get_detailed_instruct(queries: List[str]) -> List[str]:
    """Wrap each query in the retrieval instruction prompt.

    Args:
        queries: Raw query strings.

    Returns:
        A new list, in the same order as ``queries``, where every entry is
        an ``Instruct: Retrieve semantically similar text.`` line followed
        by a ``Query: <query>`` line.
    """
    return [f"Instruct: Retrieve semantically similar text.\nQuery: {query}" for query in queries]
def tokenize(
    sentences: List[str],
    tokenizer: AutoTokenizer,
    device: str = "cuda",
    max_length: int = 512,
):
    """Tokenize sentences into a padded, EOS-terminated batch of tensors.

    Args:
        sentences: Texts to encode.
        tokenizer: Tokenizer providing ``eos_token`` and ``eos_token_id``.
        device: Device the encoded batch is moved to via ``.to()``.
            Defaults to ``"cuda"``.
        max_length: Truncation limit passed to the tokenizer.
            Defaults to 512.

    Returns:
        The tokenizer's output (PyTorch tensors) on ``device``, with any
        ``token_type_ids`` entry removed.
    """
    # Append the EOS token text to every input before tokenizing.
    texts = [sentence + tokenizer.eos_token for sentence in sentences]
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=max_length,
    ).to(device)
    # Overwrite the final column with the EOS id -- presumably so sequences
    # cut off by truncation still end in EOS; TODO confirm with the authors.
    inputs.input_ids[:, -1] = tokenizer.eos_token_id
    # Drop token_type_ids if the tokenizer produced them.
    inputs.pop("token_type_ids", None)
    return inputs
def pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor, do_normalize: bool = True) -> torch.Tensor:
    """Pick one hidden-state vector per sequence: the last attended token.

    Args:
        last_hidden_state: Hidden states indexed as ``[batch, position, ...]``.
        attention_mask: Mask indexed as ``[batch, position]``; its row sums
            are used as sequence lengths (assumes 0/1 entries -- TODO confirm).
        do_normalize: When true, L2-normalize the result along dim 1.

    Returns:
        One embedding per batch row.
    """
    # If every row's final mask entry is set, the batch is treated as
    # left-padded and the last position is the last real token for all rows.
    left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
    if left_padding:
        embeddings = last_hidden_state[:, -1]
    else:
        # Otherwise index each row at (number of attended tokens - 1).
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_state.shape[0]
        batch_indices = torch.arange(batch_size, device=last_hidden_state.device)
        embeddings = last_hidden_state[batch_indices, sequence_lengths.long()]
    if do_normalize:
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    return embeddings
# Example: compute embeddings with plain transformers.
model = AutoModel.from_pretrained(pretrained_model_name_or_path="lamarr-llm-development/elbedding", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="lamarr-llm-development/elbedding", trust_remote_code=True)
model = model.to("cuda")

sentences = ["Hi how are you doing?"]
# sentences = get_detailed_instruct(sentences)  # if the sentence is a query
sentences_inputs = tokenize(sentences=sentences, tokenizer=tokenizer)

# Inference only: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
    sentences_outputs = model(**sentences_inputs)

embeddings = pool(
    last_hidden_state=sentences_outputs.last_hidden_state,
    attention_mask=sentences_inputs.attention_mask,
)
print(embeddings)
from sentence_transformers import SentenceTransformer
from typing import List
def get_detailed_instruct(queries: List[str]) -> List[str]:
    """Prefix every query with the retrieval instruction.

    Args:
        queries: Raw query strings.

    Returns:
        A list of the same length and order, each entry consisting of the
        ``Instruct: Retrieve semantically similar text.`` line followed by
        a ``Query: <query>`` line.
    """
    return [f"Instruct: Retrieve semantically similar text.\nQuery: {query}" for query in queries]
# Example: compute embeddings with sentence-transformers.
model = SentenceTransformer("lamarr-llm-development/elbedding", trust_remote_code=True)

sentences = ["Hi how are you doing?"]
# Kept below the assignment so that uncommenting it does not reference
# `sentences` before it is defined.
# sentences = get_detailed_instruct(sentences)  # if the sentence is a query
embeddings = model.encode(sentences=sentences, normalize_embeddings=True)
print(embeddings)
TBD
TBD
Do I need to add instructions to the query?
Yes. The model was trained with instructions on the query side, so omitting them will degrade performance. Documents, on the other hand, do not need instructions.
TBD
TBD