VectorStores#

Wrappers on top of vector stores.

class langchain.vectorstores.AtlasDB(name: str, embedding_function: Optional[langchain.embeddings.base.Embeddings] = None, api_key: Optional[str] = None, description: str = 'A description for your project', is_public: bool = True, reset_project_if_exists: bool = False)[source]#

Wrapper around Atlas: Nomic’s neural database and rhizomatic instrument.

To use, you should have the nomic python package installed.

Example

from langchain.vectorstores import AtlasDB
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
vectorstore = AtlasDB("my_project", embeddings.embed_query)

add_texts(texts: Iterable[str], metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, refresh: bool = True, **kwargs: Any) → List[str][source]#

Run more texts through the embeddings and add to the vectorstore.

Parameters

texts (Iterable[str]) – Texts to add to the vectorstore.
metadatas (Optional[List[dict]], optional) – Optional list of metadatas.
ids (Optional[List[str]]) – An optional list of ids.
refresh (bool) – Whether or not to refresh indices with the updated data. Default True.

Returns

List of IDs of the added texts.

Return type

List[str]

create_index(**kwargs: Any) → Any[source]#

Creates an index in your project.

See https://docs.nomic.ai/atlas_api.html#nomic.project.AtlasProject.create_index for full detail.

classmethod from_documents(documents: List[langchain.schema.Document], embedding: Optional[langchain.embeddings.base.Embeddings] = None, ids: Optional[List[str]] = None, name: Optional[str] = None, api_key: Optional[str] = None, persist_directory: Optional[str] = None, description: str = 'A description for your project', is_public: bool = True, reset_project_if_exists: bool = False, index_kwargs: Optional[dict] = None, **kwargs: Any) → langchain.vectorstores.atlas.AtlasDB[source]#

Create an AtlasDB vectorstore from a list of documents.

Parameters

name (str) – Name of the collection to create.
api_key (str) – Your nomic API key.
documents (List[Document]) – List of documents to add to the vectorstore.
embedding (Optional[Embeddings]) – Embedding function. Defaults to None.
ids (Optional[List[str]]) – Optional list of document IDs. If None, ids will be auto created
description (str) – A description for your project.
is_public (bool) – Whether your project is publicly accessible. True by default.
reset_project_if_exists (bool) – Whether to reset this project if it already exists. Default False. Generally useful during development and testing.
index_kwargs (Optional[dict]) – Dict of kwargs for index creation. See https://docs.nomic.ai/atlas_api.html

Returns

Nomic’s neural database and finest rhizomatic instrument

Return type

AtlasDB

classmethod from_texts(texts: List[str], embedding: Optional[langchain.embeddings.base.Embeddings] = None, metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, name: Optional[str] = None, api_key: Optional[str] = None, description: str = 'A description for your project', is_public: bool = True, reset_project_if_exists: bool = False, index_kwargs: Optional[dict] = None, **kwargs: Any) → langchain.vectorstores.atlas.AtlasDB[source]#

Create an AtlasDB vectorstore from raw documents.

Parameters

texts (List[str]) – The list of texts to ingest.
name (str) – Name of the project to create.
api_key (str) – Your nomic API key.
embedding (Optional[Embeddings]) – Embedding function. Defaults to None.
metadatas (Optional[List[dict]]) – List of metadatas. Defaults to None.
ids (Optional[List[str]]) – Optional list of document IDs. If None, ids will be auto created
description (str) – A description for your project.
is_public (bool) – Whether your project is publicly accessible. True by default.
reset_project_if_exists (bool) – Whether to reset this project if it already exists. Default False. Generally useful during development and testing.
index_kwargs (Optional[dict]) – Dict of kwargs for index creation. See https://docs.nomic.ai/atlas_api.html

Returns

Nomic’s neural database and finest rhizomatic instrument

Return type

AtlasDB

similarity_search(query: str, k: int = 4, **kwargs: Any) → List[langchain.schema.Document][source]#

Run similarity search with AtlasDB

Parameters

query (str) – Query text to search for.
k (int) – Number of results to return. Defaults to 4.

Returns

List of documents most similar to the query text.

Return type

List[Document]

class langchain.vectorstores.Chroma(collection_name: str = 'langchain', embedding_function: Optional[Embeddings] = None, persist_directory: Optional[str] = None, client_settings: Optional[chromadb.config.Settings] = None, collection_metadata: Optional[Dict] = None)[source]#

Wrapper around ChromaDB embeddings platform.

To use, you should have the chromadb python package installed.

Example

from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
vectorstore = Chroma("langchain_store", embeddings.embed_query)

add_texts(texts: Iterable[str], metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, **kwargs: Any) → List[str][source]#

Run more texts through the embeddings and add to the vectorstore.

Parameters

texts (Iterable[str]) – Texts to add to the vectorstore.
metadatas (Optional[List[dict]], optional) – Optional list of metadatas.
ids (Optional[List[str]], optional) – Optional list of IDs.

Returns

List of IDs of the added texts.

Return type

List[str]

delete_collection() → None[source]#: Delete the collection.

classmethod from_documents(documents: List[Document], embedding: Optional[Embeddings] = None, ids: Optional[List[str]] = None, collection_name: str = 'langchain', persist_directory: Optional[str] = None, client_settings: Optional[chromadb.config.Settings] = None, **kwargs: Any) → Chroma[source]#

Create a Chroma vectorstore from a list of documents.

If a persist_directory is specified, the collection will be persisted there. Otherwise, the data will be ephemeral in-memory.

Parameters

collection_name (str) – Name of the collection to create.
persist_directory (Optional[str]) – Directory to persist the collection.
ids (Optional[List[str]]) – List of document IDs. Defaults to None.
documents (List[Document]) – List of documents to add to the vectorstore.
embedding (Optional[Embeddings]) – Embedding function. Defaults to None.
client_settings (Optional[chromadb.config.Settings]) – Chroma client settings

Returns

Chroma vectorstore.

Return type

Chroma

classmethod from_texts(texts: List[str], embedding: Optional[Embeddings] = None, metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, collection_name: str = 'langchain', persist_directory: Optional[str] = None, client_settings: Optional[chromadb.config.Settings] = None, **kwargs: Any) → Chroma[source]#

Create a Chroma vectorstore from raw documents.

If a persist_directory is specified, the collection will be persisted there. Otherwise, the data will be ephemeral in-memory.

Parameters

texts (List[str]) – List of texts to add to the collection.
collection_name (str) – Name of the collection to create.
persist_directory (Optional[str]) – Directory to persist the collection.
embedding (Optional[Embeddings]) – Embedding function. Defaults to None.
metadatas (Optional[List[dict]]) – List of metadatas. Defaults to None.
ids (Optional[List[str]]) – List of document IDs. Defaults to None.
client_settings (Optional[chromadb.config.Settings]) – Chroma client settings

Returns

Chroma vectorstore.

Return type

Chroma

max_marginal_relevance_search(query: str, k: int = 4, fetch_k: int = 20, filter: Optional[Dict[str, str]] = None) → List[langchain.schema.Document][source]#

Return docs selected using the maximal marginal relevance.

Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents.

Parameters

query (str) – Text to look up documents similar to.
k (int) – Number of Documents to return. Defaults to 4.
fetch_k (int) – Number of Documents to fetch to pass to MMR algorithm.
filter (Optional[Dict[str, str]]) – Filter by metadata. Defaults to None.

Returns: List of Documents selected by maximal marginal relevance.

max_marginal_relevance_search_by_vector(embedding: List[float], k: int = 4, fetch_k: int = 20, filter: Optional[Dict[str, str]] = None) → List[langchain.schema.Document][source]#

Return docs selected using the maximal marginal relevance.

Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents.

Parameters

embedding (List[float]) – Embedding to look up documents similar to.
k (int) – Number of Documents to return. Defaults to 4.
fetch_k (int) – Number of Documents to fetch to pass to MMR algorithm.
filter (Optional[Dict[str, str]]) – Filter by metadata. Defaults to None.

Returns: List of Documents selected by maximal marginal relevance.

persist() → None[source]#

Persist the collection.

This can be used to explicitly persist the data to disk. It will also be called automatically when the object is destroyed.

similarity_search(query: str, k: int = 4, filter: Optional[Dict[str, str]] = None, **kwargs: Any) → List[langchain.schema.Document][source]#

Run similarity search with Chroma.

Parameters

query (str) – Query text to search for.
k (int) – Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]) – Filter by metadata. Defaults to None.

Returns

List of documents most similar to the query text.

Return type

List[Document]

similarity_search_by_vector(embedding: List[float], k: int = 4, filter: Optional[Dict[str, str]] = None, **kwargs: Any) → List[langchain.schema.Document][source]#

Return docs most similar to embedding vector.

Parameters

embedding (List[float]) – Embedding to look up documents similar to.
k (int) – Number of Documents to return. Defaults to 4.

Returns: List of Documents most similar to the query vector.

similarity_search_with_score(query: str, k: int = 4, filter: Optional[Dict[str, str]] = None, **kwargs: Any) → List[Tuple[langchain.schema.Document, float]][source]#

Run similarity search with Chroma with distance.

Parameters

query (str) – Query text to search for.
k (int) – Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]) – Filter by metadata. Defaults to None.

Returns

List of documents most similar to the query text, with distance in float.

Return type

List[Tuple[Document, float]]

class langchain.vectorstores.DeepLake(dataset_path: str = 'mem://langchain', token: Optional[str] = None, embedding_function: Optional[langchain.embeddings.base.Embeddings] = None, read_only: Optional[bool] = False, ingestion_batch_size: int = 1024, num_workers: int = 4)[source]#

Wrapper around Deep Lake, a data lake for deep learning applications.

We implement naive similarity search and filtering for fast prototyping, but it can be extended with Tensor Query Language (TQL) for production use cases over billion rows.

Why Deep Lake?

Not only stores embeddings, but also the original data with version control.
Serverless, doesn’t require another service and can be used with major
cloud providers (S3, GCS, etc.)
More than just a multi-modal vector store. You can use the dataset
to fine-tune your own LLM models.

To use, you should have the deeplake python package installed.

Example

from langchain.vectorstores import DeepLake
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
vectorstore = DeepLake("langchain_store", embeddings.embed_query)

add_texts(texts: Iterable[str], metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, **kwargs: Any) → List[str][source]#

Run more texts through the embeddings and add to the vectorstore.

Parameters

texts (Iterable[str]) – Texts to add to the vectorstore.
metadatas (Optional[List[dict]], optional) – Optional list of metadatas.
ids (Optional[List[str]], optional) – Optional list of IDs.

Returns

List of IDs of the added texts.

Return type

List[str]

delete(ids: Any[List[str], None] = None, filter: Any[Dict[str, str], None] = None, delete_all: Any[bool, None] = None) → bool[source]#

Delete the entities in the dataset

Parameters

ids (Optional[List[str]], optional) – The document_ids to delete. Defaults to None.
filter (Optional[Dict[str, str]], optional) – The filter to delete by. Defaults to None.
delete_all (Optional[bool], optional) – Whether to drop the dataset. Defaults to None.

delete_dataset() → None[source]#: Delete the collection.

classmethod from_texts(texts: List[str], embedding: Optional[langchain.embeddings.base.Embeddings] = None, metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, dataset_path: str = 'mem://langchain', **kwargs: Any) → langchain.vectorstores.deeplake.DeepLake[source]#

Create a Deep Lake dataset from raw documents.

If a dataset_path is specified, the dataset will be persisted there. Otherwise, the data will be ephemeral in-memory.

Parameters

dataset_path (str, pathlib.Path) –
- The full path to the dataset. Can be:
- Deep Lake cloud path of the form hub://username/dataset_name.
  To write to Deep Lake cloud datasets, ensure that you are logged in to Deep Lake (use ‘activeloop login’ from command line)
- AWS S3 path of the form s3://bucketname/path/to/dataset.
  Credentials are required in either the environment
- Google Cloud Storage path of the form gcs://bucketname/path/to/dataset.
  Credentials are required in either the environment
- Local file system path of the form ./path/to/dataset or
  ~/path/to/dataset or path/to/dataset.
- In-memory path of the form mem://path/to/dataset which doesn’t
  save the dataset, but keeps it in memory instead. Should be used only for testing as it does not persist.
texts (List[str]) – List of texts to add.
embedding (Optional[Embeddings]) – Embedding function. Defaults to None.
metadatas (Optional[List[dict]]) – List of metadatas. Defaults to None.
ids (Optional[List[str]]) – List of document IDs. Defaults to None.

Returns

Deep Lake dataset.

Return type

DeepLake

max_marginal_relevance_search(query: str, k: int = 4, fetch_k: int = 20) → List[langchain.schema.Document][source]#

Returns: List of Documents selected by maximal marginal relevance.

max_marginal_relevance_search_by_vector(embedding: List[float], k: int = 4, fetch_k: int = 20) → List[langchain.schema.Document][source]#

Returns: List of Documents selected by maximal marginal relevance.

persist() → None[source]#: Persist the collection.

search(query: Any[str, None] = None, embedding: Any[float, None] = None, k: int = 4, distance_metric: str = 'L2', use_maximal_marginal_relevance: Optional[bool] = False, fetch_k: Optional[int] = 20, filter: Optional[Any[Dict[str, str], Callable, str]] = None, return_score: Optional[bool] = False, **kwargs: Any) → Any[List[Document], List[Tuple[Document, float]]][source]#

Return docs most similar to query.

Parameters

query – Text to look up documents similar to.
embedding – Embedding function to use. Defaults to None.
k – Number of Documents to return. Defaults to 4.
distance_metric – L2 for Euclidean, L1 for Manhattan, max for L-infinity distance, cos for cosine similarity, ‘dot’ for dot product. Defaults to L2.
filter – Attribute filter by metadata, for example {‘key’: ‘value’}. It can also take a Deep Lake filter (https://docs.deeplake.ai/en/latest/deeplake.core.dataset.html#deeplake.core.dataset.Dataset.filter). Defaults to None.
use_maximal_marginal_relevance – Whether to use maximal marginal relevance. Defaults to False.
fetch_k – Number of Documents to fetch to pass to MMR algorithm. Defaults to 20.
return_score – Whether to return the score. Defaults to False.

Returns

List of Documents selected by the specified distance metric, if return_score True, return a tuple of (Document, score)

similarity_search(query: str, k: int = 4, **kwargs: Any) → List[langchain.schema.Document][source]#

Return docs most similar to query.

Parameters

query – text to embed and run the query on.
k – Number of Documents to return. Defaults to 4.
query – Text to look up documents similar to.
embedding – Embedding function to use. Defaults to None.
k – Number of Documents to return. Defaults to 4.
distance_metric – L2 for Euclidean, L1 for Manhattan, max for L-infinity distance, cos for cosine similarity, ‘dot’ for dot product. Defaults to L2.
filter – Attribute filter by metadata example {‘key’: ‘value’}. Defaults to None.
use_maximal_marginal_relevance – Whether to use maximal marginal relevance. Defaults to False.
fetch_k – Number of Documents to fetch to pass to MMR algorithm. Defaults to 20.
return_score – Whether to return the score. Defaults to False.

Returns

List of Documents most similar to the query vector.

similarity_search_by_vector(embedding: List[float], k: int = 4, **kwargs: Any) → List[langchain.schema.Document][source]#

Return docs most similar to embedding vector.

Parameters

embedding – Embedding to look up documents similar to.
k – Number of Documents to return. Defaults to 4.

Returns

List of Documents most similar to the query vector.

similarity_search_with_score(query: str, distance_metric: str = 'L2', k: int = 4, filter: Optional[Dict[str, str]] = None) → List[Tuple[langchain.schema.Document, float]][source]#

Run similarity search with Deep Lake with distance returned.

Parameters

query (str) – Query text to search for.
distance_metric – L2 for Euclidean, L1 for Manhattan, max for L-infinity distance, cos for cosine similarity, ‘dot’ for dot product. Defaults to L2.
k (int) – Number of results to return. Defaults to 4.
filter (Optional[Dict[str, str]]) – Filter by metadata. Defaults to None.

Returns

List of documents most similar to the query text, with distance in float.

Return type

List[Tuple[Document, float]]

class langchain.vectorstores.ElasticVectorSearch(elasticsearch_url: str, index_name: str, embedding: langchain.embeddings.base.Embeddings)[source]#

Wrapper around Elasticsearch as a vector database.

To connect to an Elasticsearch instance that does not require login credentials, pass the Elasticsearch URL and index name along with the embedding object to the constructor.

Example

from langchain import ElasticVectorSearch
from langchain.embeddings import OpenAIEmbeddings

embedding = OpenAIEmbeddings()
elastic_vector_search = ElasticVectorSearch(
    elasticsearch_url="http://localhost:9200",
    index_name="test_index",
    embedding=embedding
)

To connect to an Elasticsearch instance that requires login credentials, including Elastic Cloud, use the Elasticsearch URL format https://username:password@es_host:9243. For example, to connect to Elastic Cloud, create the Elasticsearch URL with the required authentication details and pass it to the ElasticVectorSearch constructor as the named parameter elasticsearch_url.

You can obtain your Elastic Cloud URL and login credentials by logging in to the Elastic Cloud console at https://cloud.elastic.co, selecting your deployment, and navigating to the “Deployments” page.

To obtain your Elastic Cloud password for the default “elastic” user:

Log in to the Elastic Cloud console at https://cloud.elastic.co
Go to “Security” > “Users”
Locate the “elastic” user and click “Edit”
Click “Reset password”
Follow the prompts to reset the password

The format for Elastic Cloud URLs is https://username:password@cluster_id.region_id.gcp.cloud.es.io:9243.

Example

from langchain import ElasticVectorSearch
from langchain.embeddings import OpenAIEmbeddings

embedding = OpenAIEmbeddings()

elastic_host = "cluster_id.region_id.gcp.cloud.es.io"
elasticsearch_url = f"https://username:password@{elastic_host}:9243"
elastic_vector_search = ElasticVectorSearch(
    elasticsearch_url=elasticsearch_url,
    index_name="test_index",
    embedding=embedding
)

Parameters

elasticsearch_url (str) – The URL for the Elasticsearch instance.
index_name (str) – The name of the Elasticsearch index for the embeddings.
embedding (Embeddings) – An object that provides the ability to embed text. It should be an instance of a class that subclasses the Embeddings abstract base class, such as OpenAIEmbeddings()

Raises

ValueError – If the elasticsearch python package is not installed.

add_texts(texts: Iterable[str], metadatas: Optional[List[dict]] = None, refresh_indices: bool = True, **kwargs: Any) → List[str][source]#

Run more texts through the embeddings and add to the vectorstore.

Parameters

texts – Iterable of strings to add to the vectorstore.
metadatas – Optional list of metadatas associated with the texts.
refresh_indices – bool to refresh ElasticSearch indices

Returns

List of ids from adding the texts into the vectorstore.

classmethod from_texts(texts: List[str], embedding: langchain.embeddings.base.Embeddings, metadatas: Optional[List[dict]] = None, **kwargs: Any) → langchain.vectorstores.elastic_vector_search.ElasticVectorSearch[source]#

Construct ElasticVectorSearch wrapper from raw documents.

This is a user-friendly interface that:

Embeds documents.
Creates a new index for the embeddings in the Elasticsearch instance.
Adds the documents to the newly created Elasticsearch index.

This is intended to be a quick way to get started.

Example

from langchain import ElasticVectorSearch
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
elastic_vector_search = ElasticVectorSearch.from_texts(
    texts,
    embeddings,
    elasticsearch_url="http://localhost:9200"
)

similarity_search(query: str, k: int = 4, **kwargs: Any) → List[langchain.schema.Document][source]#

Return docs most similar to query.

Parameters

query – Text to look up documents similar to.
k – Number of Documents to return. Defaults to 4.

Returns

List of Documents most similar to the query.

class langchain.vectorstores.FAISS(embedding_function: Callable, index: Any, docstore: langchain.docstore.base.Docstore, index_to_docstore_id: Dict[int, str])[source]#

Wrapper around FAISS vector database.

To use, you should have the faiss python package installed.

Example

from langchain import FAISS
faiss = FAISS(embedding_function, index, docstore, index_to_docstore_id)

add_embeddings(text_embeddings: Iterable[Tuple[str, List[float]]], metadatas: Optional[List[dict]] = None, **kwargs: Any) → List[str][source]#

Run more texts through the embeddings and add to the vectorstore.

Parameters

text_embeddings – Iterable pairs of string and embedding to add to the vectorstore.
metadatas – Optional list of metadatas associated with the texts.

Returns

List of ids from adding the texts into the vectorstore.

add_texts(texts: Iterable[str], metadatas: Optional[List[dict]] = None, **kwargs: Any) → List[str][source]#

Run more texts through the embeddings and add to the vectorstore.

Parameters

texts – Iterable of strings to add to the vectorstore.
metadatas – Optional list of metadatas associated with the texts.

Returns

List of ids from adding the texts into the vectorstore.

classmethod from_embeddings(text_embeddings: List[Tuple[str, List[float]]], embedding: langchain.embeddings.base.Embeddings, metadatas: Optional[List[dict]] = None, **kwargs: Any) → langchain.vectorstores.faiss.FAISS[source]#

Construct FAISS wrapper from raw documents.

This is a user friendly interface that:

Embeds documents.
Creates an in memory docstore
Initializes the FAISS database

This is intended to be a quick way to get started.

Example

from langchain import FAISS
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
faiss = FAISS.from_texts(texts, embeddings)

Construct FAISS wrapper from raw documents.

This is a user friendly interface that:

Embeds documents.
Creates an in memory docstore
Initializes the FAISS database

This is intended to be a quick way to get started.

Example

from langchain import FAISS
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
faiss = FAISS.from_texts(texts, embeddings)

classmethod load_local(folder_path: str, embeddings: langchain.embeddings.base.Embeddings, index_name: str = 'index') → langchain.vectorstores.faiss.FAISS[source]#

Load FAISS index, docstore, and index_to_docstore_id from disk.

Parameters

folder_path – folder path to load index, docstore, and index_to_docstore_id from.
embeddings – Embeddings to use when generating queries
index_name – for loading with a specific index file name

max_marginal_relevance_search(query: str, k: int = 4, fetch_k: int = 20) → List[langchain.schema.Document][source]#

Return docs selected using the maximal marginal relevance.

Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents.

Parameters

query – Text to look up documents similar to.
k – Number of Documents to return. Defaults to 4.
fetch_k – Number of Documents to fetch to pass to MMR algorithm.

Returns

List of Documents selected by maximal marginal relevance.

max_marginal_relevance_search_by_vector(embedding: List[float], k: int = 4, fetch_k: int = 20) → List[langchain.schema.Document][source]#

Return docs selected using the maximal marginal relevance.

Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents.

Parameters

embedding – Embedding to look up documents similar to.
k – Number of Documents to return. Defaults to 4.
fetch_k – Number of Documents to fetch to pass to MMR algorithm.

Returns

List of Documents selected by maximal marginal relevance.

merge_from(target: langchain.vectorstores.faiss.FAISS) → None[source]#

Merge another FAISS object with the current one.

Add the target FAISS to the current one.

Parameters: target – FAISS object you wish to merge into the current one
Returns: None.

save_local(folder_path: str, index_name: str = 'index') → None[source]#

Save FAISS index, docstore, and index_to_docstore_id to disk.

Parameters

folder_path – folder path to save index, docstore, and index_to_docstore_id to.
index_name – for saving with a specific index file name

similarity_search(query: str, k: int = 4, **kwargs: Any) → List[langchain.schema.Document][source]#

Return docs most similar to query.

Parameters

query – Text to look up documents similar to.
k – Number of Documents to return. Defaults to 4.

Returns

List of Documents most similar to the query.

similarity_search_by_vector(embedding: List[float], k: int = 4, **kwargs: Any) → List[langchain.schema.Document][source]#

Return docs most similar to embedding vector.

Parameters

embedding – Embedding to look up documents similar to.
k – Number of Documents to return. Defaults to 4.

Returns

List of Documents most similar to the embedding.

similarity_search_with_score(query: str, k: int = 4) → List[Tuple[langchain.schema.Document, float]][source]#

Return docs most similar to query.

Parameters

query – Text to look up documents similar to.
k – Number of Documents to return. Defaults to 4.

Returns

List of Documents most similar to the query and score for each

similarity_search_with_score_by_vector(embedding: List[float], k: int = 4) → List[Tuple[langchain.schema.Document, float]][source]#

Return docs most similar to query.

Parameters

embedding – Embedding vector to look up documents similar to.
k – Number of Documents to return. Defaults to 4.

Returns

List of Documents most similar to the query and score for each

class langchain.vectorstores.Milvus(embedding_function: langchain.embeddings.base.Embeddings, connection_args: dict, collection_name: str, text_field: str)[source]#

Wrapper around the Milvus vector database.

add_texts(texts: Iterable[str], metadatas: Optional[List[dict]] = None, partition_name: Optional[str] = None, timeout: Optional[int] = None, **kwargs: Any) → List[str][source]#

Insert text data into Milvus.

When using add_texts() it is assumed that a collection has already been made and indexed. If metadata is included, it is assumed that it is ordered correctly to match the schema provided to the Collection and that the embedding vector is the first schema field.

Parameters

texts (Iterable[str]) – The text being embedded and inserted.
metadatas (Optional[List[dict]], optional) – The metadata that corresponds to each insert. Defaults to None.
partition_name (str, optional) – The partition of the collection to insert data into. Defaults to None.
timeout – specified timeout.

Returns

The resulting keys for each inserted element.

Return type

List[str]

Create a Milvus collection, index it with HNSW, and insert data.

Parameters

texts (List[str]) – Text to insert.
embedding (Embeddings) – Embedding function to use.
metadatas (Optional[List[dict]], optional) – Dict metadata. Defaults to None.

Returns

The Milvus vector store.

Return type

VectorStore

max_marginal_relevance_search(query: str, k: int = 4, fetch_k: int = 20, param: Optional[dict] = None, expr: Optional[str] = None, partition_names: Optional[List[str]] = None, round_decimal: int = - 1, timeout: Optional[int] = None, **kwargs: Any) → List[langchain.schema.Document][source]#

Perform a search and return results that are reordered by MMR.

Parameters

query (str) – The text being searched.
k (int, optional) – How many results to give. Defaults to 4.
fetch_k (int, optional) – Total results to select k from. Defaults to 20.
param (dict, optional) – The search params for the specified index. Defaults to None.
expr (str, optional) – Filtering expression. Defaults to None.
partition_names (List[str], optional) – What partitions to search. Defaults to None.
round_decimal (int, optional) – Round the resulting distance. Defaults to -1.
timeout (int, optional) – Amount to wait before timeout error. Defaults to None.

Returns

Document results for search.

Return type

List[Document]

similarity_search(query: str, k: int = 4, param: Optional[dict] = None, expr: Optional[str] = None, partition_names: Optional[List[str]] = None, round_decimal: int = - 1, timeout: Optional[int] = None, **kwargs: Any) → List[langchain.schema.Document][source]#

Perform a similarity search against the query string.

Parameters

query (str) – The text to search.
k (int, optional) – How many results to return. Defaults to 4.
param (dict, optional) – The search params for the index type. Defaults to None.
expr (str, optional) – Filtering expression. Defaults to None.
partition_names (List[str], optional) – What partitions to search. Defaults to None.
round_decimal (int, optional) – What decimal point to round to. Defaults to -1.
timeout (int, optional) – How long to wait before timeout error. Defaults to None.

Returns

Document results for search.

Return type

List[Document]

similarity_search_with_score(query: str, k: int = 4, param: Optional[dict] = None, expr: Optional[str] = None, partition_names: Optional[List[str]] = None, round_decimal: int = - 1, timeout: Optional[int] = None, **kwargs: Any) → List[Tuple[langchain.schema.Document, float]][source]#

Perform a search on a query string and return results.

Parameters

query (str) – The text being searched.
k (int, optional) – The number of results to return. Defaults to 4.
param (dict, optional) – The search params for the specified index. Defaults to None.
expr (str, optional) – Filtering expression. Defaults to None.
partition_names (List[str], optional) – Partitions to search through. Defaults to None.
round_decimal (int, optional) – Round the resulting distance. Defaults to -1.
timeout (int, optional) – Amount to wait before timeout error. Defaults to None.
kwargs – Collection.search() keyword arguments.

Returns

search_embedding, (Document, distance, primary_field) results.

Return type

List[float], List[Tuple[Document, any, any]]

class langchain.vectorstores.OpenSearchVectorSearch(opensearch_url: str, index_name: str, embedding_function: langchain.embeddings.base.Embeddings, **kwargs: Any)[source]#

Wrapper around OpenSearch as a vector database.

Example

from langchain import OpenSearchVectorSearch
opensearch_vector_search = OpenSearchVectorSearch(
    "http://localhost:9200",
    "embeddings",
    embedding_function
)

add_texts(texts: Iterable[str], metadatas: Optional[List[dict]] = None, bulk_size: int = 500, **kwargs: Any) → List[str][source]#

Run more texts through the embeddings and add to the vectorstore.

Parameters

texts – Iterable of strings to add to the vectorstore.
metadatas – Optional list of metadatas associated with the texts.
bulk_size – Bulk API request count; Default: 500

Returns

List of ids from adding the texts into the vectorstore.

Optional Args:

vector_field: Document field embeddings are stored in. Defaults to “vector_field”.

text_field: Document field the text of the document is stored in. Defaults to “text”.

classmethod from_texts(texts: List[str], embedding: langchain.embeddings.base.Embeddings, metadatas: Optional[List[dict]] = None, bulk_size: int = 500, **kwargs: Any) → langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch[source]#

Construct OpenSearchVectorSearch wrapper from raw documents.

Example

from langchain import OpenSearchVectorSearch
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
opensearch_vector_search = OpenSearchVectorSearch.from_texts(
    texts,
    embeddings,
    opensearch_url="http://localhost:9200"
)

OpenSearch by default supports Approximate Search powered by the nmslib, faiss and lucene engines, which is recommended for large datasets. It also supports brute force search through Script Scoring and Painless Scripting.

Optional Args:

vector_field: Document field embeddings are stored in. Defaults to “vector_field”.

text_field: Document field the text of the document is stored in. Defaults to “text”.

Optional Keyword Args for Approximate Search:

engine: “nmslib”, “faiss”, “hnsw”; default: “nmslib”

space_type: “l2”, “l1”, “cosinesimil”, “linf”, “innerproduct”; default: “l2”

ef_search: Size of the dynamic list used during k-NN searches. Higher values lead to more accurate but slower searches; default: 512

ef_construction: Size of the dynamic list used during k-NN graph creation. Higher values lead to more accurate graph but slower indexing speed; default: 512

m: Number of bidirectional links created for each new element. Large impact on memory consumption. Between 2 and 100; default: 16

Keyword Args for Script Scoring or Painless Scripting:

is_appx_search: False

similarity_search(query: str, k: int = 4, **kwargs: Any) → List[langchain.schema.Document][source]#

Return docs most similar to query.

By default supports Approximate Search. Also supports Script Scoring and Painless Scripting.

Parameters

query – Text to look up documents similar to.
k – Number of Documents to return. Defaults to 4.

Returns

List of Documents most similar to the query.

Optional Args:

vector_field: Document field embeddings are stored in. Defaults to “vector_field”.

text_field: Document field the text of the document is stored in. Defaults to “text”.

metadata_field: Document field that metadata is stored in. Defaults to “metadata”. Can be set to a special value “*” to include the entire document.

Optional Args for Approximate Search:

search_type: “approximate_search”; default: “approximate_search”

size: number of results the query actually returns; default: 4

Optional Args for Script Scoring Search:

search_type: “script_scoring”; default: “approximate_search”

space_type: “l2”, “l1”, “linf”, “cosinesimil”, “innerproduct”, “hammingbit”; default: “l2”

pre_filter: script_score query to pre-filter documents before identifying nearest neighbors; default: {“match_all”: {}}

Optional Args for Painless Scripting Search:

search_type: “painless_scripting”; default: “approximate_search”

space_type: “l2Squared”, “l1Norm”, “cosineSimilarity”; default: “l2Squared”

pre_filter: script_score query to pre-filter documents before identifying nearest neighbors; default: {“match_all”: {}}

class langchain.vectorstores.Pinecone(index: Any, embedding_function: Callable, text_key: str, namespace: Optional[str] = None)[source]#

Wrapper around Pinecone vector database.

To use, you should have the pinecone-client python package installed.

Example

from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

# The environment should be the one specified next to the API key
# in your Pinecone console
pinecone.init(api_key="***", environment="...")
index = pinecone.Index("langchain-demo")
embeddings = OpenAIEmbeddings()
vectorstore = Pinecone(index, embeddings.embed_query, "text")

add_texts(texts: Iterable[str], metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, namespace: Optional[str] = None, batch_size: int = 32, **kwargs: Any) → List[str][source]#

Run more texts through the embeddings and add to the vectorstore.

Parameters

texts – Iterable of strings to add to the vectorstore.
metadatas – Optional list of metadatas associated with the texts.
ids – Optional list of ids to associate with the texts.
namespace – Optional pinecone namespace to add the texts to.

Returns

List of ids from adding the texts into the vectorstore.

classmethod from_existing_index(index_name: str, embedding: langchain.embeddings.base.Embeddings, text_key: str = 'text', namespace: Optional[str] = None) → langchain.vectorstores.pinecone.Pinecone[source]#: Load pinecone vectorstore from index name.

classmethod from_texts(texts: List[str], embedding: langchain.embeddings.base.Embeddings, metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, batch_size: int = 32, text_key: str = 'text', index_name: Optional[str] = None, namespace: Optional[str] = None, **kwargs: Any) → langchain.vectorstores.pinecone.Pinecone[source]#

Construct Pinecone wrapper from raw documents.

This is a user friendly interface that:

Embeds documents.
Adds the documents to a provided Pinecone index

This is intended to be a quick way to get started.

Example

from langchain import Pinecone
from langchain.embeddings import OpenAIEmbeddings
import pinecone

# The environment should be the one specified next to the API key
# in your Pinecone console
pinecone.init(api_key="***", environment="...")
embeddings = OpenAIEmbeddings()
pinecone = Pinecone.from_texts(
    texts,
    embeddings,
    index_name="langchain-demo"
)

similarity_search(query: str, k: int = 4, filter: Optional[dict] = None, namespace: Optional[str] = None, **kwargs: Any) → List[langchain.schema.Document][source]#

Return pinecone documents most similar to query.

Parameters

query – Text to look up documents similar to.
k – Number of Documents to return. Defaults to 4.
filter – Dictionary of argument(s) to filter on metadata
namespace – Namespace to search in. Default will search in ‘’ namespace.

Returns

List of Documents most similar to the query.

similarity_search_with_score(query: str, k: int = 4, filter: Optional[dict] = None, namespace: Optional[str] = None) → List[Tuple[langchain.schema.Document, float]][source]#

Return pinecone documents most similar to query, along with scores.

Parameters

query – Text to look up documents similar to.
k – Number of Documents to return. Defaults to 4.
filter – Dictionary of argument(s) to filter on metadata
namespace – Namespace to search in. Default will search in ‘’ namespace.

Returns

List of Documents most similar to the query and score for each

class langchain.vectorstores.Qdrant(client: Any, collection_name: str, embedding_function: Callable, content_payload_key: str = 'page_content', metadata_payload_key: str = 'metadata')[source]#

Wrapper around Qdrant vector database.

To use you should have the qdrant-client package installed.

Example

from qdrant_client import QdrantClient
from langchain import Qdrant

client = QdrantClient()
collection_name = "MyCollection"
qdrant = Qdrant(client, collection_name, embedding_function)

CONTENT_KEY = 'page_content'#

METADATA_KEY = 'metadata'#

add_texts(texts: Iterable[str], metadatas: Optional[List[dict]] = None, **kwargs: Any) → List[str][source]#

Run more texts through the embeddings and add to the vectorstore.

Parameters

texts – Iterable of strings to add to the vectorstore.
metadatas – Optional list of metadatas associated with the texts.

Returns

List of ids from adding the texts into the vectorstore.

classmethod from_texts(texts: List[str], embedding: langchain.embeddings.base.Embeddings, metadatas: Optional[List[dict]] = None, location: Optional[str] = None, url: Optional[str] = None, port: Optional[int] = 6333, grpc_port: int = 6334, prefer_grpc: bool = False, https: Optional[bool] = None, api_key: Optional[str] = None, prefix: Optional[str] = None, timeout: Optional[float] = None, host: Optional[str] = None, path: Optional[str] = None, collection_name: Optional[str] = None, distance_func: str = 'Cosine', content_payload_key: str = 'page_content', metadata_payload_key: str = 'metadata', **kwargs: Any) → langchain.vectorstores.qdrant.Qdrant[source]#

Construct Qdrant wrapper from a list of texts.

Parameters

texts – A list of texts to be indexed in Qdrant.
embedding – A subclass of Embeddings, responsible for text vectorization.
metadatas – An optional list of metadata. If provided it has to be of the same length as a list of texts.
location – If :memory: - use in-memory Qdrant instance. If str - use it as a url parameter. If None - fallback to relying on host and port parameters.
url – either host or str of “Optional[scheme], host, Optional[port], Optional[prefix]”. Default: None
port – Port of the REST API interface. Default: 6333
grpc_port – Port of the gRPC interface. Default: 6334
prefer_grpc – If true - use gRPC interface whenever possible in custom methods. Default: False
https – If true - use HTTPS(SSL) protocol. Default: None
api_key – API key for authentication in Qdrant Cloud. Default: None
prefix – If not None - add prefix to the REST URL path. Example: service/v1 will result in http://localhost:6333/service/v1/{qdrant-endpoint} for REST API. Default: None
timeout – Timeout for REST and gRPC API requests. Default: 5.0 seconds for REST and unlimited for gRPC
host – Host name of Qdrant service. If url and host are None, set to ‘localhost’. Default: None
path – Path in which the vectors will be stored while using local mode. Default: None
collection_name – Name of the Qdrant collection to be used. If not provided, it will be created randomly. Default: None
distance_func – Distance function. One of: “Cosine” / “Euclid” / “Dot”. Default: “Cosine”
content_payload_key – A payload key used to store the content of the document. Default: “page_content”
metadata_payload_key – A payload key used to store the metadata of the document. Default: “metadata”
**kwargs – Additional arguments passed directly into REST client initialization

This is a user friendly interface that:

Creates embeddings, one for each text
Initializes the Qdrant database as an in-memory docstore by default (and overridable to a remote docstore)
Adds the text embeddings to the Qdrant database

This is intended to be a quick way to get started.

Example

from langchain import Qdrant
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
qdrant = Qdrant.from_texts(texts, embeddings, host="localhost")

max_marginal_relevance_search(query: str, k: int = 4, fetch_k: int = 20) → List[langchain.schema.Document][source]#

Return docs selected using the maximal marginal relevance.

Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents.

Parameters

query – Text to look up documents similar to.
k – Number of Documents to return. Defaults to 4.
fetch_k – Number of Documents to fetch to pass to MMR algorithm. Defaults to 20.

Returns

List of Documents selected by maximal marginal relevance.

similarity_search(query: str, k: int = 4, filter: Optional[Dict[str, Union[str, int, bool]]] = None, **kwargs: Any) → List[langchain.schema.Document][source]#

Return docs most similar to query.

Parameters

query – Text to look up documents similar to.
k – Number of Documents to return. Defaults to 4.
filter – Filter by metadata. Defaults to None.

Returns

List of Documents most similar to the query.

similarity_search_with_score(query: str, k: int = 4, filter: Optional[Dict[str, Union[str, int, bool]]] = None) → List[Tuple[langchain.schema.Document, float]][source]#

Return docs most similar to query, along with a score for each.

Parameters

query – Text to look up documents similar to.
k – Number of Documents to return. Defaults to 4.
filter – Filter by metadata. Defaults to None.

Returns

List of Documents most similar to the query and score for each.

class langchain.vectorstores.VectorStore[source]#

Interface for vector stores.

async aadd_documents(documents: List[langchain.schema.Document], **kwargs: Any) → List[str][source]#

Run more documents through the embeddings and add to the vectorstore.

Parameters: documents (List[Document]) – Documents to add to the vectorstore.
Returns: List of IDs of the added texts.
Return type: List[str]

async aadd_texts(texts: Iterable[str], metadatas: Optional[List[dict]] = None, **kwargs: Any) → List[str][source]#: Run more texts through the embeddings and add to the vectorstore.

add_documents(documents: List[langchain.schema.Document], **kwargs: Any) → List[str][source]#

Run more documents through the embeddings and add to the vectorstore.

Parameters: documents (List[Document]) – Documents to add to the vectorstore.
Returns: List of IDs of the added texts.
Return type: List[str]

abstract add_texts(texts: Iterable[str], metadatas: Optional[List[dict]] = None, **kwargs: Any) → List[str][source]#

Run more texts through the embeddings and add to the vectorstore.

Parameters

texts – Iterable of strings to add to the vectorstore.
metadatas – Optional list of metadatas associated with the texts.
kwargs – vectorstore specific parameters

Returns

List of ids from adding the texts into the vectorstore.

async classmethod afrom_documents(documents: List[langchain.schema.Document], embedding: langchain.embeddings.base.Embeddings, **kwargs: Any) → langchain.vectorstores.base.VST[source]#: Return VectorStore initialized from documents and embeddings.

async classmethod afrom_texts(texts: List[str], embedding: langchain.embeddings.base.Embeddings, metadatas: Optional[List[dict]] = None, **kwargs: Any) → langchain.vectorstores.base.VST[source]#: Return VectorStore initialized from texts and embeddings.

async amax_marginal_relevance_search(query: str, k: int = 4, fetch_k: int = 20) → List[langchain.schema.Document][source]#: Return docs selected using the maximal marginal relevance.

async amax_marginal_relevance_search_by_vector(embedding: List[float], k: int = 4, fetch_k: int = 20) → List[langchain.schema.Document][source]#: Return docs selected using the maximal marginal relevance.

as_retriever(**kwargs: Any) → langchain.schema.BaseRetriever[source]#

async asimilarity_search(query: str, k: int = 4, **kwargs: Any) → List[langchain.schema.Document][source]#: Return docs most similar to query.

async asimilarity_search_by_vector(embedding: List[float], k: int = 4, **kwargs: Any) → List[langchain.schema.Document][source]#: Return docs most similar to embedding vector.

classmethod from_documents(documents: List[langchain.schema.Document], embedding: langchain.embeddings.base.Embeddings, **kwargs: Any) → langchain.vectorstores.base.VST[source]#: Return VectorStore initialized from documents and embeddings.

abstract classmethod from_texts(texts: List[str], embedding: langchain.embeddings.base.Embeddings, metadatas: Optional[List[dict]] = None, **kwargs: Any) → langchain.vectorstores.base.VST[source]#: Return VectorStore initialized from texts and embeddings.

max_marginal_relevance_search(query: str, k: int = 4, fetch_k: int = 20) → List[langchain.schema.Document][source]#

Return docs selected using the maximal marginal relevance.

Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents.

Parameters

query – Text to look up documents similar to.
k – Number of Documents to return. Defaults to 4.
fetch_k – Number of Documents to fetch to pass to MMR algorithm. Defaults to 20.

Returns

List of Documents selected by maximal marginal relevance.

max_marginal_relevance_search_by_vector(embedding: List[float], k: int = 4, fetch_k: int = 20) → List[langchain.schema.Document][source]#

Return docs selected using the maximal marginal relevance.

Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents.

Parameters

embedding – Embedding to look up documents similar to.
k – Number of Documents to return. Defaults to 4.
fetch_k – Number of Documents to fetch to pass to MMR algorithm. Defaults to 20.

Returns

List of Documents selected by maximal marginal relevance.

abstract similarity_search(query: str, k: int = 4, **kwargs: Any) → List[langchain.schema.Document][source]#: Return docs most similar to query.

similarity_search_by_vector(embedding: List[float], k: int = 4, **kwargs: Any) → List[langchain.schema.Document][source]#

Return docs most similar to embedding vector.

Parameters

embedding – Embedding to look up documents similar to.
k – Number of Documents to return. Defaults to 4.

Returns

List of Documents most similar to the query vector.

class langchain.vectorstores.Weaviate(client: Any, index_name: str, text_key: str, attributes: Optional[List[str]] = None)[source]#

Wrapper around Weaviate vector database.

To use, you should have the weaviate-client python package installed.

Example

import weaviate
from langchain.vectorstores import Weaviate
client = weaviate.Client(url=os.environ["WEAVIATE_URL"], ...)
weaviate = Weaviate(client, index_name, text_key)

add_texts(texts: Iterable[str], metadatas: Optional[List[dict]] = None, **kwargs: Any) → List[str][source]#: Upload texts with metadata (properties) to Weaviate.

classmethod from_texts: Construct Weaviate wrapper from raw documents.

This is a user-friendly interface that:

Embeds documents.
Creates a new index for the embeddings in the Weaviate instance.
Adds the documents to the newly created Weaviate index.

This is intended to be a quick way to get started.

Example

from langchain.vectorstores.weaviate import Weaviate
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
weaviate = Weaviate.from_texts(
    texts,
    embeddings,
    weaviate_url="http://localhost:8080"
)

similarity_search(query: str, k: int = 4, **kwargs: Any) → List[langchain.schema.Document][source]#

Return docs most similar to query.

Parameters

query – Text to look up documents similar to.
k – Number of Documents to return. Defaults to 4.

Returns

List of Documents most similar to the query.

similarity_search_by_vector(embedding: List[float], k: int = 4, **kwargs: Any) → List[langchain.schema.Document][source]#: Look up similar documents by embedding vector in Weaviate.