Añadiendo todos los archivos del proyecto (incluidos secretos y venv)
This commit is contained in:
43
venv/lib/python3.12/site-packages/mysql/ai/genai/__init__.py
Normal file
43
venv/lib/python3.12/site-packages/mysql/ai/genai/__init__.py
Normal file
@@ -0,0 +1,43 @@
|
||||
# Copyright (c) 2025 Oracle and/or its affiliates.
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License, version 2.0, as
|
||||
# published by the Free Software Foundation.
|
||||
#
|
||||
# This program is designed to work with certain software (including
|
||||
# but not limited to OpenSSL) that is licensed under separate terms,
|
||||
# as designated in a particular file or component or in included license
|
||||
# documentation. The authors of MySQL hereby grant you an
|
||||
# additional permission to link the program and your derivative works
|
||||
# with the separately licensed software that they have either included with
|
||||
# the program or referenced in the documentation.
|
||||
#
|
||||
# Without limiting anything contained in the foregoing, this file,
|
||||
# which is part of MySQL Connector/Python, is also subject to the
|
||||
# Universal FOSS Exception, version 1.0, a copy of which can be found at
|
||||
# http://oss.oracle.com/licenses/universal-foss-exception.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
# See the GNU General Public License, version 2.0, for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
"""GenAI package for MySQL Connector/Python.
|
||||
|
||||
Performs optional dependency checks and exposes public classes:
|
||||
- MyEmbeddings
|
||||
- MyLLM
|
||||
- MyVectorStore
|
||||
"""
|
||||
from mysql.ai.utils import check_dependencies as _check_dependencies
|
||||
|
||||
_check_dependencies(["GENAI"])
|
||||
del _check_dependencies
|
||||
|
||||
from .embedding import MyEmbeddings
|
||||
from .generation import MyLLM
|
||||
from .vector_store import MyVectorStore
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
197
venv/lib/python3.12/site-packages/mysql/ai/genai/embedding.py
Normal file
197
venv/lib/python3.12/site-packages/mysql/ai/genai/embedding.py
Normal file
@@ -0,0 +1,197 @@
|
||||
# Copyright (c) 2025 Oracle and/or its affiliates.
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License, version 2.0, as
|
||||
# published by the Free Software Foundation.
|
||||
#
|
||||
# This program is designed to work with certain software (including
|
||||
# but not limited to OpenSSL) that is licensed under separate terms,
|
||||
# as designated in a particular file or component or in included license
|
||||
# documentation. The authors of MySQL hereby grant you an
|
||||
# additional permission to link the program and your derivative works
|
||||
# with the separately licensed software that they have either included with
|
||||
# the program or referenced in the documentation.
|
||||
#
|
||||
# Without limiting anything contained in the foregoing, this file,
|
||||
# which is part of MySQL Connector/Python, is also subject to the
|
||||
# Universal FOSS Exception, version 1.0, a copy of which can be found at
|
||||
# http://oss.oracle.com/licenses/universal-foss-exception.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
# See the GNU General Public License, version 2.0, for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
"""Embeddings integration utilities for MySQL Connector/Python.
|
||||
|
||||
Provides MyEmbeddings class to generate embeddings via MySQL HeatWave
|
||||
using ML_EMBED_TABLE and ML_EMBED_ROW.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from pydantic import PrivateAttr
|
||||
|
||||
from mysql.ai.utils import (
|
||||
atomic_transaction,
|
||||
execute_sql,
|
||||
format_value_sql,
|
||||
source_schema,
|
||||
sql_table_from_df,
|
||||
sql_table_to_df,
|
||||
temporary_sql_tables,
|
||||
)
|
||||
from mysql.connector.abstracts import MySQLConnectionAbstract
|
||||
|
||||
|
||||
class MyEmbeddings(Embeddings):
    """
    Embedding generator class that uses a MySQL database to compute embeddings for input text.

    This class batches input text into temporary SQL tables, invokes MySQL's ML_EMBED_TABLE
    to generate embeddings, and retrieves the results as lists of floats.

    Attributes:
        _db_connection (MySQLConnectionAbstract): MySQL connection used for all database operations.
        schema_name (str): Name of the database schema to use.
        options_placeholder (str): SQL-ready placeholder string for ML_EMBED_TABLE options.
        options_params: Positional SQL parameter values matching options_placeholder
            (returned by format_value_sql; passed/unpacked into execute_sql).
    """

    _db_connection: MySQLConnectionAbstract = PrivateAttr()

    def __init__(
        self, db_connection: MySQLConnectionAbstract, options: Optional[Dict] = None
    ):
        """
        Initialize MyEmbeddings with a database connection and optional embedding parameters.

        References:
            https://dev.mysql.com/doc/heatwave/en/mys-hwgenai-ml-embed-row.html
            A full list of supported options can be found under "options"

        NOTE: The supported "options" are the intersection of the options provided in
            https://dev.mysql.com/doc/heatwave/en/mys-hwgenai-ml-embed-row.html
            https://dev.mysql.com/doc/heatwave/en/mys-hwgenai-ml-embed-table.html

        Args:
            db_connection: Active MySQL connector database connection.
            options: Optional dictionary of options for embedding operations.

        Raises:
            ValueError: If the schema name is not valid
            DatabaseError:
                If a database connection issue occurs.
                If an operational error occurs during execution.
        """
        super().__init__()
        self._db_connection = db_connection
        self.schema_name = source_schema(db_connection)
        options = options or {}
        # Pre-render the options as a placeholder string + parameter values so
        # every query can reuse them without re-serializing.
        self.options_placeholder, self.options_params = format_value_sql(options)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a list of input texts using the MySQL ML embedding procedure.

        References:
            https://dev.mysql.com/doc/heatwave/en/mys-hwgenai-ml-embed-table.html

        Args:
            texts: List of input strings to embed.

        Returns:
            List of lists of floats, with each inner list containing the embedding for a text.

        Raises:
            DatabaseError:
                If provided options are invalid or unsupported.
                If a database connection issue occurs.
                If an operational error occurs during execution.
            ValueError:
                If one or more text entries were unable to be embedded.

        Implementation notes:
            - Creates a temporary table to pass input text to the MySQL embedding service.
            - Adds a primary key to ensure results preserve input order.
            - Calls ML_EMBED_TABLE and fetches the resulting embeddings.
            - Deletes the temporary table after use to avoid polluting the database.
            - Embedding vectors are extracted from the "embeddings" column of the result table.
        """
        if not texts:
            return []

        # "id" preserves input order; the table helper is expected to key on it.
        df = pd.DataFrame({"id": range(len(texts)), "text": texts})

        with (
            atomic_transaction(self._db_connection) as cursor,
            temporary_sql_tables(self._db_connection) as temporary_tables,
        ):
            qualified_table_name, table_name = sql_table_from_df(
                cursor, self.schema_name, df
            )
            # Register the table so the context manager drops it on exit.
            temporary_tables.append((self.schema_name, table_name))

            # ML_EMBED_TABLE expects input/output columns and options as parameters.
            # NOTE(review): the table/column identifiers are interpolated into the
            # SQL text (identifiers cannot be bound as parameters); they are assumed
            # to come from sql_table_from_df and therefore be safe — confirm.
            embed_query = (
                "CALL sys.ML_EMBED_TABLE("
                f"'{qualified_table_name}.text', "
                f"'{qualified_table_name}.embeddings', "
                f"{self.options_placeholder}"
                ")"
            )
            execute_sql(cursor, embed_query, params=self.options_params)

            # Read back all columns, including "embeddings"
            df_embeddings = sql_table_to_df(cursor, self.schema_name, table_name)

            # pandas' isnull() already flags both None and NaN entries, so a single
            # check covers every "embedding missing" case (the previous extra
            # `any(e is None ...)` scan was redundant).
            if df_embeddings["embeddings"].isnull().any():
                raise ValueError(
                    "Failure to generate embeddings for one or more text entry."
                )

            # Convert fetched embeddings to lists of floats
            embeddings = [list(e) for e in df_embeddings["embeddings"].tolist()]

        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """
        Generate an embedding for a single text string.

        References:
            https://dev.mysql.com/doc/heatwave/en/mys-hwgenai-ml-embed-row.html

        Args:
            text: The input string to embed.

        Returns:
            List of floats representing the embedding vector.

        Raises:
            DatabaseError:
                If provided options are invalid or unsupported.
                If a database connection issue occurs.
                If an operational error occurs during execution.

        Example:
            >>> MyEmbeddings(db_conn).embed_query("Hello world")
            [0.1, 0.2, ...]
        """
        with atomic_transaction(self._db_connection) as cursor:
            # The text is bound as a parameter (no SQL injection via `text`).
            # NOTE(review): the %s placeholder is wrapped in double quotes; a
            # standard connector substitution would already quote the value,
            # yielding a doubly-quoted literal — verify execute_sql's
            # substitution semantics before changing this.
            execute_sql(
                cursor,
                f'SELECT sys.ML_EMBED_ROW("%s", {self.options_placeholder})',
                params=(text, *self.options_params),
            )
            return list(cursor.fetchone()[0])
|
||||
162
venv/lib/python3.12/site-packages/mysql/ai/genai/generation.py
Normal file
162
venv/lib/python3.12/site-packages/mysql/ai/genai/generation.py
Normal file
@@ -0,0 +1,162 @@
|
||||
# Copyright (c) 2025 Oracle and/or its affiliates.
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License, version 2.0, as
|
||||
# published by the Free Software Foundation.
|
||||
#
|
||||
# This program is designed to work with certain software (including
|
||||
# but not limited to OpenSSL) that is licensed under separate terms,
|
||||
# as designated in a particular file or component or in included license
|
||||
# documentation. The authors of MySQL hereby grant you an
|
||||
# additional permission to link the program and your derivative works
|
||||
# with the separately licensed software that they have either included with
|
||||
# the program or referenced in the documentation.
|
||||
#
|
||||
# Without limiting anything contained in the foregoing, this file,
|
||||
# which is part of MySQL Connector/Python, is also subject to the
|
||||
# Universal FOSS Exception, version 1.0, a copy of which can be found at
|
||||
# http://oss.oracle.com/licenses/universal-foss-exception.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
# See the GNU General Public License, version 2.0, for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
"""GenAI LLM integration utilities for MySQL Connector/Python.
|
||||
|
||||
Provides MyLLM wrapper that issues ML_GENERATE calls via SQL.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
from typing import Any, List, Optional
|
||||
|
||||
try:
|
||||
from langchain_core.language_models.llms import LLM
|
||||
except ImportError:
|
||||
from langchain.llms.base import LLM
|
||||
from pydantic import PrivateAttr
|
||||
|
||||
from mysql.ai.utils import atomic_transaction, execute_sql, format_value_sql
|
||||
from mysql.connector.abstracts import MySQLConnectionAbstract
|
||||
|
||||
|
||||
class MyLLM(LLM):
    """
    Custom Large Language Model (LLM) interface for MySQL HeatWave.

    This class wraps the generation functionality provided by HeatWave LLMs,
    exposing an interface compatible with common LLM APIs for text generation.
    It provides full support for generative queries and limited support for
    agentic queries.

    Attributes:
        _db_connection (MySQLConnectionAbstract):
            Underlying MySQL connector database connection.
    """

    _db_connection: MySQLConnectionAbstract = PrivateAttr()

    class Config:
        """
        Pydantic config for the model.

        By default, LangChain (through Pydantic BaseModel) does not allow
        setting or storing undeclared attributes such as _db_connection.
        Setting extra = "allow" makes it possible to store extra attributes
        on the class instance, which is required for MyLLM.
        """

        extra = "allow"

    def __init__(self, db_connection: MySQLConnectionAbstract):
        """
        Initialize the MyLLM instance with an active MySQL database connection.

        Args:
            db_connection: A MySQL connection object used to run LLM queries.

        Notes:
            The connection is stored on the `_db_connection` PrivateAttr via a
            plain attribute assignment, which Pydantic routes to private
            storage for declared private attributes.
        """
        super().__init__()

        self._db_connection = db_connection

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> str:
        """
        Generate a text completion from the LLM for a given input prompt.

        References:
            https://dev.mysql.com/doc/heatwave/en/mys-hwgenai-ml-generate.html
            A full list of supported options (specified by kwargs) can be found under "options"

        Args:
            prompt: The input prompt string for the language model.
            stop: Optional list of stop strings to support agentic and chain-of-thought
                reasoning workflows.
            **kwargs: Additional keyword arguments providing generation options to
                the LLM (these are serialized to JSON and passed to the HeatWave syscall).

        Returns:
            The generated model output as a string.
            (The actual completion does NOT include the input prompt.)

        Raises:
            DatabaseError:
                If provided options are invalid or unsupported.
                If a database connection issue occurs.
                If an operational error occurs during execution.

        Implementation Notes:
            - Serializes kwargs into a SQL-compatible JSON string.
            - Calls the LLM stored procedure using a database cursor context.
            - Uses `sys.ML_GENERATE` on the server to produce the model output.
            - Expects the server response to be a JSON object with a 'text' key.
        """
        # Copy so the caller's kwargs dict is never mutated.
        options = kwargs.copy()
        if stop is not None:
            options["stop_sequences"] = stop

        options_placeholder, options_params = format_value_sql(options)
        with atomic_transaction(self._db_connection) as cursor:
            # The prompt is passed as a parameterized argument (avoids SQL injection).
            # NOTE(review): "%s" appears inside double quotes; a standard connector
            # substitution would already quote the value, which would produce a
            # doubly-quoted literal — confirm execute_sql's substitution semantics.
            generate_query = f"""SELECT sys.ML_GENERATE("%s", {options_placeholder});"""
            execute_sql(cursor, generate_query, params=(prompt, *options_params))
            # Expect a JSON-encoded result from MySQL; parse to extract the output.
            llm_response = json.loads(cursor.fetchone()[0])["text"]

        return llm_response

    @property
    def _identifying_params(self) -> dict:
        """
        Return a dictionary of params that uniquely identify this LLM instance.

        Returns:
            dict: Dictionary of identifier parameters (should include
                model_name for tracing/caching).
        """
        return {
            "model_name": "mysql_heatwave_llm",
        }

    @property
    def _llm_type(self) -> str:
        """
        Get the type name of this LLM implementation.

        Returns:
            A string identifying the LLM provider (used for logging or metrics).
        """
        return "mysql_heatwave_llm"
|
||||
520
venv/lib/python3.12/site-packages/mysql/ai/genai/vector_store.py
Normal file
520
venv/lib/python3.12/site-packages/mysql/ai/genai/vector_store.py
Normal file
@@ -0,0 +1,520 @@
|
||||
# Copyright (c) 2025 Oracle and/or its affiliates.
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License, version 2.0, as
|
||||
# published by the Free Software Foundation.
|
||||
#
|
||||
# This program is designed to work with certain software (including
|
||||
# but not limited to OpenSSL) that is licensed under separate terms,
|
||||
# as designated in a particular file or component or in included license
|
||||
# documentation. The authors of MySQL hereby grant you an
|
||||
# additional permission to link the program and your derivative works
|
||||
# with the separately licensed software that they have either included with
|
||||
# the program or referenced in the documentation.
|
||||
#
|
||||
# Without limiting anything contained in the foregoing, this file,
|
||||
# which is part of MySQL Connector/Python, is also subject to the
|
||||
# Universal FOSS Exception, version 1.0, a copy of which can be found at
|
||||
# http://oss.oracle.com/licenses/universal-foss-exception.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
# See the GNU General Public License, version 2.0, for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
"""MySQL-backed vector store for embeddings and semantic document retrieval.
|
||||
|
||||
Provides a VectorStore implementation persisting documents, metadata, and
|
||||
embeddings in MySQL, plus similarity search utilities.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
from typing import Any, Iterable, List, Optional, Sequence, Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
from pydantic import PrivateAttr
|
||||
|
||||
from mysql.ai.genai.embedding import MyEmbeddings
|
||||
from mysql.ai.utils import (
|
||||
VAR_NAME_SPACE,
|
||||
atomic_transaction,
|
||||
delete_sql_table,
|
||||
execute_sql,
|
||||
extend_sql_table,
|
||||
format_value_sql,
|
||||
get_random_name,
|
||||
is_table_empty,
|
||||
source_schema,
|
||||
table_exists,
|
||||
)
|
||||
from mysql.connector.abstracts import MySQLConnectionAbstract
|
||||
|
||||
# Probe text used once at construction time to discover the embedding dimension.
BASIC_EMBEDDING_QUERY = "Hello world!"
# NOTE(review): EMBEDDING_SOURCE is not referenced in this module — presumably
# consumed elsewhere in the package; confirm before removing.
EMBEDDING_SOURCE = "external_source"

# Names of MySQL session variables (under the shared VAR_NAME_SPACE prefix)
# used to pass data to/from the ML_SIMILARITY_SEARCH stored procedure.
VAR_EMBEDDING = f"{VAR_NAME_SPACE}.embedding"
VAR_CONTEXT = f"{VAR_NAME_SPACE}.context"
VAR_CONTEXT_MAP = f"{VAR_NAME_SPACE}.context_map"
VAR_RETRIEVAL_INFO = f"{VAR_NAME_SPACE}.retrieval_info"
# NOTE(review): VAR_OPTIONS is not referenced in this module — confirm usage.
VAR_OPTIONS = f"{VAR_NAME_SPACE}.options"

# Prefix for internally generated document IDs (see MyVectorStore._get_ids).
ID_SPACE = "internal_ai_id_"
|
||||
|
||||
|
||||
class MyVectorStore(VectorStore):
    """
    MySQL-backed vector store for handling embeddings and semantic document retrieval.

    Supports adding, deleting, and searching high-dimensional vector representations
    of documents using efficient storage and HeatWave ML similarity search procedures.

    Supports use as a context manager: when used in a `with` statement, all backing
    tables/data are deleted automatically when the block exits (even on exception).

    Attributes:
        _db_connection (MySQLConnectionAbstract): Active MySQL database connection.
        _embedder (Embeddings): Embeddings generator for computing vector representations.
        _schema_name (str): SQL schema for table storage.
        _table_name (Optional[str]): Name of the active table backing the store
            (or None until created).
        _embedding_dimension (int): Size of embedding vectors stored.
        _next_id (int): Internal counter for unique document ID generation.
    """

    _db_connection: MySQLConnectionAbstract = PrivateAttr()
    _embedder: Embeddings = PrivateAttr()
    _schema_name: str = PrivateAttr()
    _table_name: Optional[str] = PrivateAttr()
    _embedding_dimension: int = PrivateAttr()
    _next_id: int = PrivateAttr()

    def __init__(
        self,
        db_connection: MySQLConnectionAbstract,
        embedder: Optional[Embeddings] = None,
    ) -> None:
        """
        Initialize a MyVectorStore with a database connection and embedding generator.

        Args:
            db_connection: MySQL database connection for all vector operations.
            embedder: Embeddings generator used for creating and querying embeddings.
                Defaults to a MyEmbeddings built on the same connection.

        Raises:
            ValueError: If the schema name is not valid
            DatabaseError:
                If a database connection issue occurs.
                If an operational error occurs during execution.
        """
        super().__init__()
        self._next_id = 0

        self._schema_name = source_schema(db_connection)
        self._embedder = embedder or MyEmbeddings(db_connection)
        self._db_connection = db_connection
        # Created lazily on first insert; None means "no backing table yet".
        self._table_name: Optional[str] = None

        # Embedding dimension determined using an example call.
        # Assumes embeddings have fixed length.
        self._embedding_dimension = len(
            self._embedder.embed_query(BASIC_EMBEDDING_QUERY)
        )

    def _get_ids(self, num_ids: int) -> list[str]:
        """
        Generate a batch of unique internal document IDs for vector storage.

        Args:
            num_ids: Number of IDs to create.

        Returns:
            List of sequentially numbered internal string IDs.
        """
        # Use the module-level ID_SPACE prefix (previously the prefix was
        # hard-coded here, duplicating the constant).
        ids = [
            f"{ID_SPACE}{i}" for i in range(self._next_id, self._next_id + num_ids)
        ]
        self._next_id += num_ids
        return ids

    def _make_vector_store(self) -> None:
        """
        Create a backing SQL table for storing vectors if not already created.

        Returns:
            None

        Raises:
            DatabaseError:
                If a database connection issue occurs.
                If an operational error occurs during execution.

        Notes:
            The table name is randomized to avoid collisions.
            Schema includes content, metadata, and embedding vector.
        """
        if self._table_name is None:
            with atomic_transaction(self._db_connection) as cursor:
                # Retry random names until one does not collide with an
                # existing table in the schema.
                table_name = get_random_name(
                    lambda table_name: not table_exists(
                        cursor, self._schema_name, table_name
                    )
                )

                # The vector dimension is bound as a parameter; identifiers
                # (schema/table) cannot be parameterized and are generated
                # internally, not user-supplied.
                create_table_stmt = f"""
                    CREATE TABLE {self._schema_name}.{table_name} (
                        `id` VARCHAR(128) NOT NULL,
                        `content` TEXT,
                        `metadata` JSON DEFAULT NULL,
                        `embed` vector(%s),
                        PRIMARY KEY (`id`)
                    ) ENGINE=InnoDB;
                """
                execute_sql(
                    cursor, create_table_stmt, params=(self._embedding_dimension,)
                )

                self._table_name = table_name

    def delete(self, ids: Optional[Sequence[str]] = None, **_: Any) -> None:
        """
        Delete documents by ID. Optionally deletes the vector table if empty after deletions.

        Args:
            ids: Optional sequence of document IDs to delete. If None, no action is taken.

        Returns:
            None

        Raises:
            DatabaseError:
                If a database connection issue occurs.
                If an operational error occurs during execution.

        Notes:
            If the backing table is empty after deletions, the table is dropped and
            table_name is set to None.
        """
        # Guard: with no backing table there is nothing to delete, and the SQL
        # below would otherwise interpolate the literal string "None" as a
        # table name.
        if self._table_name is None:
            return

        with atomic_transaction(self._db_connection) as cursor:
            if ids:
                for _id in ids:
                    execute_sql(
                        cursor,
                        f"DELETE FROM {self._schema_name}.{self._table_name} WHERE id = %s",
                        params=(_id,),
                    )

            if is_table_empty(cursor, self._schema_name, self._table_name):
                self.delete_all()

    def delete_all(self) -> None:
        """
        Delete and drop the entire vector store table.

        Returns:
            None
        """
        if self._table_name is not None:
            with atomic_transaction(self._db_connection) as cursor:
                delete_sql_table(cursor, self._schema_name, self._table_name)
            self._table_name = None

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[list[dict]] = None,
        ids: Optional[List[str]] = None,
        **_: dict,
    ) -> List[str]:
        """
        Add a batch of text strings and corresponding metadata to the vector store.

        Args:
            texts: List of strings to embed and store.
            metadatas: Optional list of metadata dicts (one per text).
            ids: Optional custom document IDs.

        Returns:
            List of document IDs corresponding to the added texts.

        Raises:
            DatabaseError:
                If a database connection issue occurs.
                If an operational error occurs during execution.

        Notes:
            If metadatas is None, an empty dict is assigned to each document.
        """
        texts = list(texts)

        documents = [
            Document(page_content=text, metadata=meta)
            for text, meta in zip(texts, metadatas or [{}] * len(texts))
        ]
        return self.add_documents(documents, ids=ids)

    @classmethod
    def from_texts(
        cls,
        texts: Iterable[str],
        embedder: Embeddings,
        metadatas: Optional[list[dict]] = None,
        db_connection: Optional[MySQLConnectionAbstract] = None,
    ) -> VectorStore:
        """
        Construct and populate a MyVectorStore instance from raw texts and metadata.

        Args:
            texts: List of strings to vectorize and store.
            embedder: Embeddings generator to use.
            metadatas: Optional list of metadata dicts per text.
            db_connection: Active MySQL connection (required).

        Returns:
            Instance of MyVectorStore containing the added texts.

        Raises:
            ValueError: If db_connection is not provided.
            DatabaseError:
                If a database connection issue occurs.
                If an operational error occurs during execution.
        """
        if db_connection is None:
            raise ValueError(
                "db_connection must be specified to create a MyVectorStore object"
            )

        texts = list(texts)

        instance = cls(db_connection=db_connection, embedder=embedder)
        instance.add_texts(texts, metadatas=metadatas)

        return instance

    def add_documents(
        self, documents: list[Document], ids: Optional[list[str]] = None
    ) -> list[str]:
        """
        Embed and store Document objects as high-dimensional vectors with metadata.

        Args:
            documents: List of Document objects (each with 'page_content' and 'metadata').
            ids: Optional list of explicit document IDs. Must match the length of documents.

        Returns:
            List of document IDs stored.

        Raises:
            ValueError: If provided IDs do not match the number of documents.
            DatabaseError:
                If a database connection issue occurs.
                If an operational error occurs during execution.

        Notes:
            Automatically creates the backing table if it does not exist.
        """
        if ids and len(ids) != len(documents):
            msg = (
                "ids must be the same length as documents. "
                f"Got {len(ids)} ids and {len(documents)} documents."
            )
            raise ValueError(msg)

        if len(documents) > 0:
            self._make_vector_store()
        else:
            return []

        if ids is None:
            ids = self._get_ids(len(documents))

        content = [doc.page_content for doc in documents]
        vectors = self._embedder.embed_documents(content)

        df = pd.DataFrame()
        df["id"] = ids
        df["content"] = content
        df["embed"] = vectors
        df["metadata"] = [doc.metadata for doc in documents]

        with atomic_transaction(self._db_connection) as cursor:
            # Embeddings are inserted through string_to_vector() so the VECTOR
            # column receives a proper binary vector.
            extend_sql_table(
                cursor,
                self._schema_name,
                self._table_name,
                df,
                col_name_to_placeholder_string={"embed": "string_to_vector(%s)"},
            )

        return ids

    def similarity_search(
        self,
        query: str,
        k: int = 3,
        **kwargs: Any,
    ) -> list[Document]:
        """
        Search for and return the most similar documents in the store to the given query.

        Args:
            query: String query to embed and use for similarity search.
            k: Number of top documents to return.
            kwargs: options to pass to ML_SIMILARITY_SEARCH. Currently supports
                distance_metric, max_distance, percentage_distance, and segment_overlap

        Returns:
            List of Document objects, ordered from most to least similar.

        Raises:
            DatabaseError:
                If provided kwargs are invalid or unsupported.
                If a database connection issue occurs.
                If an operational error occurs during execution.

        Implementation Notes:
            - Calls ML similarity search within MySQL using stored procedures.
            - Retrieves IDs, content, and metadata for search matches.
            - Parsing and retrieval for context results are handled via intermediate JSONs.
        """
        if self._table_name is None:
            return []

        embedding = self._embedder.embed_query(query)

        with atomic_transaction(self._db_connection) as cursor:
            # Set the embedding variable for the similarity search SP
            execute_sql(
                cursor,
                f"SET @{VAR_EMBEDDING} = string_to_vector(%s)",
                params=[str(embedding)],
            )

            distance_metric = kwargs.get("distance_metric", "COSINE")
            retrieval_options = {
                "max_distance": kwargs.get("max_distance", 0.6),
                "percentage_distance": kwargs.get("percentage_distance", 20.0),
                "segment_overlap": kwargs.get("segment_overlap", 0),
            }

            retrieval_options_placeholder, retrieval_options_params = format_value_sql(
                retrieval_options
            )
            # NOTE(review): k is interpolated directly into the SQL text;
            # callers are expected to pass an int — consider parameterizing.
            similarity_search_query = f"""
                CALL sys.ML_SIMILARITY_SEARCH(
                    @{VAR_EMBEDDING},
                    JSON_ARRAY(
                        '{self._schema_name}.{self._table_name}'
                    ),
                    JSON_OBJECT(
                        "segment", "content",
                        "segment_embedding", "embed",
                        "document_name", "id"
                    ),
                    {k},
                    %s,
                    NULL,
                    NULL,
                    {retrieval_options_placeholder},
                    @{VAR_CONTEXT},
                    @{VAR_CONTEXT_MAP},
                    @{VAR_RETRIEVAL_INFO}
                )
            """

            execute_sql(
                cursor,
                similarity_search_query,
                params=[distance_metric, *retrieval_options_params],
            )
            execute_sql(cursor, f"SELECT @{VAR_CONTEXT_MAP}")

            results = []

            context_maps = json.loads(cursor.fetchone()[0])
            for context in context_maps:
                # Fetch the full row for each match ("document_name" holds our id).
                execute_sql(
                    cursor,
                    (
                        "SELECT id, content, metadata "
                        f"FROM {self._schema_name}.{self._table_name} "
                        "WHERE id = %s"
                    ),
                    params=(context["document_name"],),
                )
                doc_id, content, metadata = cursor.fetchone()

                doc_args = {
                    "id": doc_id,
                    "page_content": content,
                }
                if metadata is not None:
                    doc_args["metadata"] = json.loads(metadata)

                doc = Document(**doc_args)
                results.append(doc)

        return results

    def __enter__(self) -> "VectorStore":
        """
        Enter the runtime context related to this vector store instance.

        Returns:
            The current MyVectorStore object, allowing use within a `with` statement block.

        Usage Notes:
            - Intended for use in a `with` statement to ensure automatic
              cleanup of resources.
            - No special initialization occurs during context entry, but enables
              proper context-managed lifecycle.

        Example:
            with MyVectorStore(db_connection, embedder) as vectorstore:
                vectorstore.add_texts([...])
                # Vector store is active within this block.
            # All storage and resources are now cleaned up.
        """
        return self

    def __exit__(
        self,
        exc_type: Union[type, None],
        exc_val: Union[BaseException, None],
        exc_tb: Union[object, None],
    ) -> None:
        """
        Exit the runtime context for the vector store, ensuring all storage
        resources are cleaned up.

        Args:
            exc_type: The exception type, if any exception occurred in the context block.
            exc_val: The exception value, if any exception occurred in the context block.
            exc_tb: The traceback object, if any exception occurred in the context block.

        Returns:
            None: Indicates that exceptions are never suppressed; they will propagate as normal.

        Implementation Notes:
            - Automatically deletes all vector store data and backing tables via `delete_all()`
              upon exiting the context.
            - This cleanup occurs whether the block exits normally or due to an exception.
            - Does not suppress exceptions; errors in the context block will continue to propagate.
            - Use when the vector store lifecycle is intended to be temporary or scoped.
        """
        self.delete_all()
        # No return, so exceptions are never suppressed
|
||||
Reference in New Issue
Block a user