Instructor embedders integration for Haystack
Module haystack_integrations.components.embedders.instructor_embedders.instructor_document_embedder
InstructorDocumentEmbedder
A component for computing Document embeddings using INSTRUCTOR embedding models.
The embedding of each Document is stored in the embedding
field of the Document.
Usage example:
# To use this component, install the "instructor-embedders-haystack" package.
# pip install instructor-embedders-haystack
from haystack_integrations.components.embedders.instructor_embedders import InstructorDocumentEmbedder
from haystack.dataclasses import Document
from haystack.utils import ComponentDevice
doc_embedding_instruction = "Represent the Medical Document for retrieval:"
doc_embedder = InstructorDocumentEmbedder(
model="hkunlp/instructor-base",
instruction=doc_embedding_instruction,
batch_size=32,
device=ComponentDevice.from_str("cpu"),
)
doc_embedder.warm_up()
# Text taken from PubMed QA Dataset (https://huggingface.co/datasets/pubmed_qa)
document_list = [
Document(
content="Oxidative stress generated within inflammatory joints can produce autoimmune phenomena and joint destruction. Radical species with oxidative activity, including reactive nitrogen species, represent mediators of inflammation and cartilage damage.",
meta={
"pubid": "25,445,628",
"long_answer": "yes",
},
),
Document(
content="Plasma levels of pancreatic polypeptide (PP) rise upon food intake. Although other pancreatic islet hormones, such as insulin and glucagon, have been extensively investigated, PP secretion and actions are still poorly understood.",
meta={
"pubid": "25,445,712",
"long_answer": "yes",
},
),
]
result = doc_embedder.run(document_list)
print(f"Document Text: {result['documents'][0].content}")
print(f"Document Embedding: {result['documents'][0].embedding}")
print(f"Embedding Dimension: {len(result['documents'][0].embedding)}")
InstructorDocumentEmbedder.__init__
def __init__(model: str = "hkunlp/instructor-base",
device: Optional[ComponentDevice] = None,
token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN",
strict=False),
instruction: str = "Represent the document",
batch_size: int = 32,
progress_bar: bool = True,
normalize_embeddings: bool = False,
meta_fields_to_embed: Optional[List[str]] = None,
embedding_separator: str = "\n")
Create an InstructorDocumentEmbedder component.
Arguments:
model
: Local path or name of the model in Hugging Face's model hub, such as 'hkunlp/instructor-base'.
device
: The device on which the model is loaded. If None, the default device is automatically selected.
token
: An API token used to download private models from Hugging Face. If this parameter is set to True, then the token generated when running transformers-cli login (stored in ~/.huggingface) will be used.
instruction
: The instruction string to be used while computing domain-specific embeddings. The instruction follows the unified template of the form: "Represent the 'domain' 'text_type' for 'task_objective'", where:
- "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc.
- "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc.
- "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc. Check some examples of instructions here.
batch_size
: Number of strings to encode at once.
progress_bar
: If true, displays progress bar during embedding.
normalize_embeddings
: If set to true, returned vectors will have the length of 1.
meta_fields_to_embed
: List of meta fields that should be embedded along with the Document content.
embedding_separator
: Separator used to concatenate the meta fields to the Document content.
InstructorDocumentEmbedder.to_dict
def to_dict() -> Dict[str, Any]
Serializes the component to a dictionary.
Returns:
Dictionary with serialized data.
InstructorDocumentEmbedder.from_dict
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "InstructorDocumentEmbedder"
Deserializes the component from a dictionary.
Arguments:
data
: Dictionary to deserialize from.
Returns:
Deserialized component.
InstructorDocumentEmbedder.warm_up
def warm_up()
Initializes the component.
InstructorDocumentEmbedder.run
@component.output_types(documents=List[Document])
def run(documents: List[Document])
Embed a list of Documents. The embedding of each Document is stored in the embedding
field of the Document.
param documents: A list of Documents to embed.
Module haystack_integrations.components.embedders.instructor_embedders.instructor_text_embedder
InstructorTextEmbedder
A component for embedding strings using INSTRUCTOR embedding models.
Usage example:
# To use this component, install the "instructor-embedders-haystack" package.
# pip install instructor-embedders-haystack
from haystack.utils import ComponentDevice
from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder
text = ("It clearly says online this will work on a Mac OS system. The disk comes and it does not, only Windows. "
        "Do Not order this if you have a Mac!!")
instruction = (
"Represent the Amazon comment for classifying the sentence as positive or negative"
)
text_embedder = InstructorTextEmbedder(
model="hkunlp/instructor-base", instruction=instruction,
device=ComponentDevice.from_str("cpu")
)
text_embedder.warm_up()
embedding = text_embedder.run(text)
InstructorTextEmbedder.__init__
def __init__(model: str = "hkunlp/instructor-base",
device: Optional[ComponentDevice] = None,
token: Optional[Secret] = Secret.from_env_var("HF_API_TOKEN",
strict=False),
instruction: str = "Represent the sentence",
batch_size: int = 32,
progress_bar: bool = True,
normalize_embeddings: bool = False)
Create an InstructorTextEmbedder component.
Arguments:
model
: Local path or name of the model in Hugging Face's model hub, such as 'hkunlp/instructor-base'.
device
: The device on which the model is loaded. If None, the default device is automatically selected.
token
: The API token used to download private models from Hugging Face.
instruction
: The instruction string to be used while computing domain-specific embeddings. The instruction follows the unified template of the form: "Represent the 'domain' 'text_type' for 'task_objective'", where:
- "domain" is optional, and it specifies the domain of the text, e.g., science, finance, medicine, etc.
- "text_type" is required, and it specifies the encoding unit, e.g., sentence, document, paragraph, etc.
- "task_objective" is optional, and it specifies the objective of embedding, e.g., retrieve a document, classify the sentence, etc. Check some examples of instructions here.
batch_size
: Number of strings to encode at once.
progress_bar
: If true, displays progress bar during embedding.
normalize_embeddings
: If set to true, returned vectors will have the length of 1.
InstructorTextEmbedder.to_dict
def to_dict() -> Dict[str, Any]
Serializes the component to a dictionary.
Returns:
Dictionary with serialized data.
InstructorTextEmbedder.from_dict
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "InstructorTextEmbedder"
Deserialize this component from a dictionary.
InstructorTextEmbedder.warm_up
def warm_up()
Load the embedding backend.
InstructorTextEmbedder.run
@component.output_types(embedding=List[float])
def run(text: str)
Embed a string.