dify/api/providers/vdb/vdb-oceanbase/tests/integration_tests/bench_oceanbase.py

"""
Benchmark: OceanBase vector store — old (single-row) vs new (batch) insertion,
metadata query with/without functional index, and vector search across metrics.

Usage (from repo root):
    uv run --project api python api/packages/dify-vdb-oceanbase/tests/bench_oceanbase.py
"""

import json
import logging
import random
import statistics
import time
import uuid

from pyobvector import VECTOR, ObVecClient, cosine_distance, inner_product, l2_distance
from sqlalchemy import JSON, Column, String, text
from sqlalchemy.dialects.mysql import LONGTEXT

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
HOST = "127.0.0.1"
PORT = 2881
USER = "root@test"
PASSWORD = "difyai123456"
DATABASE = "test"

VEC_DIM = 1536
HNSW_BUILD = {"M": 16, "efConstruction": 256}
DISTANCE_FUNCS = {"l2": l2_distance, "cosine": cosine_distance, "inner_product": inner_product}


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_client(**extra):
    return ObVecClient(
        uri=f"{HOST}:{PORT}",
        user=USER,
        password=PASSWORD,
        db_name=DATABASE,
        **extra,
    )


def _rand_vec():
    return [random.uniform(-1, 1) for _ in range(VEC_DIM)]  # noqa: S311


def _drop(client, table):
    client.drop_table_if_exist(table)


def _create_table(client, table, metric="l2"):
    cols = [
        Column("id", String(36), primary_key=True, autoincrement=False),
        Column("vector", VECTOR(VEC_DIM)),
        Column("text", LONGTEXT),
        Column("metadata", JSON),
    ]
    vidx = client.prepare_index_params()
    vidx.add_index(
        field_name="vector",
        index_type="HNSW",
        index_name="vector_index",
        metric_type=metric,
        params=HNSW_BUILD,
    )
    client.create_table_with_index_params(table_name=table, columns=cols, vidxs=vidx)
    client.refresh_metadata([table])


def _gen_rows(n):
    doc_id = str(uuid.uuid4())
    rows = []
    for _ in range(n):
        rows.append(
            {
                "id": str(uuid.uuid4()),
                "vector": _rand_vec(),
                "text": f"benchmark text {uuid.uuid4().hex[:12]}",
                "metadata": json.dumps({"document_id": doc_id, "dataset_id": str(uuid.uuid4())}),
            }
        )
    return rows, doc_id


# ---------------------------------------------------------------------------
# Benchmark: Insertion
# ---------------------------------------------------------------------------
def bench_insert_single(client, table, rows):
    """Old approach: one INSERT per row."""
    t0 = time.perf_counter()
    for row in rows:
        client.insert(table_name=table, data=row)
    return time.perf_counter() - t0


def bench_insert_batch(client, table, rows, batch_size=100):
    """New approach: batch INSERT."""
    t0 = time.perf_counter()
    for start in range(0, len(rows), batch_size):
        batch = rows[start : start + batch_size]
        client.insert(table_name=table, data=batch)
    return time.perf_counter() - t0


# ---------------------------------------------------------------------------
# Benchmark: Metadata query
# ---------------------------------------------------------------------------
def bench_metadata_query(client, table, doc_id, with_index=False):
    """Query by metadata->>'$.document_id' with/without functional index."""
    if with_index:
        try:
            client.perform_raw_text_sql(f"CREATE INDEX idx_metadata_doc_id ON `{table}` ((metadata->>'$.document_id'))")
        except Exception:
            logger.debug("Index idx_metadata_doc_id already exists, skipping creation")

    sql = text(f"SELECT id FROM `{table}` WHERE metadata->>'$.document_id' = :val")
    times = []
    with client.engine.connect() as conn:
        for _ in range(10):
            t0 = time.perf_counter()
            result = conn.execute(sql, {"val": doc_id})
            _ = result.fetchall()
            times.append(time.perf_counter() - t0)
    return times


# ---------------------------------------------------------------------------
# Benchmark: Vector search
# ---------------------------------------------------------------------------
def bench_vector_search(client, table, metric, topk=10, n_queries=20):
    dist_func = DISTANCE_FUNCS[metric]
    times = []
    for _ in range(n_queries):
        q = _rand_vec()
        t0 = time.perf_counter()
        cur = client.ann_search(
            table_name=table,
            vec_column_name="vector",
            vec_data=q,
            topk=topk,
            distance_func=dist_func,
            output_column_names=["text", "metadata"],
            with_dist=True,
        )
        _ = list(cur)
        times.append(time.perf_counter() - t0)
    return times


def _fmt(times):
    """Format list of durations as 'mean ± stdev'."""
    m = statistics.mean(times) * 1000
    s = statistics.stdev(times) * 1000 if len(times) > 1 else 0
    return f"{m:.1f} ± {s:.1f} ms"


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    client = _make_client()
    client_pooled = _make_client(pool_size=5, max_overflow=10, pool_recycle=3600, pool_pre_ping=True)

    logger.info("=" * 70)
    logger.info("OceanBase Vector Store — Performance Benchmark")
    logger.info("  Endpoint : %s:%s", HOST, PORT)
    logger.info("  Vec dim  : %s", VEC_DIM)
    logger.info("=" * 70)

    # ------------------------------------------------------------------
    # 1. Insertion benchmark
    # ------------------------------------------------------------------
    for n_docs in [100, 500, 1000]:
        rows, doc_id = _gen_rows(n_docs)
        tbl_single = f"bench_single_{n_docs}"
        tbl_batch = f"bench_batch_{n_docs}"

        _drop(client, tbl_single)
        _drop(client, tbl_batch)
        _create_table(client, tbl_single)
        _create_table(client, tbl_batch)

        t_single = bench_insert_single(client, tbl_single, rows)
        t_batch = bench_insert_batch(client_pooled, tbl_batch, rows, batch_size=100)

        speedup = t_single / t_batch if t_batch > 0 else float("inf")
        logger.info("\n[Insert %s docs]", n_docs)
        logger.info("  Single-row : %.2fs", t_single)
        logger.info("  Batch(100) : %.2fs", t_batch)
        logger.info("  Speedup    : %.1fx", speedup)

    # ------------------------------------------------------------------
    # 2. Metadata query benchmark (use the 1000-doc batch table)
    # ------------------------------------------------------------------
    tbl_meta = "bench_batch_1000"
    rows_1000, doc_id_1000 = _gen_rows(1000)
    # The table already has 1000 rows from step 1; use that doc_id
    # Re-query doc_id from one of the rows we inserted
    with client.engine.connect() as conn:
        res = conn.execute(text(f"SELECT metadata->>'$.document_id' FROM `{tbl_meta}` LIMIT 1"))
        doc_id_1000 = res.fetchone()[0]

    logger.info("\n[Metadata filter query — 1000 rows, by document_id]")
    times_no_idx = bench_metadata_query(client, tbl_meta, doc_id_1000, with_index=False)
    logger.info("  Without index : %s", _fmt(times_no_idx))
    times_with_idx = bench_metadata_query(client, tbl_meta, doc_id_1000, with_index=True)
    logger.info("  With index    : %s", _fmt(times_with_idx))

    # ------------------------------------------------------------------
    # 3. Vector search benchmark — across metrics
    # ------------------------------------------------------------------
    logger.info("\n[Vector search — top-10, 20 queries each, on 1000 rows]")

    for metric in ["l2", "cosine", "inner_product"]:
        tbl_vs = f"bench_vs_{metric}"
        _drop(client_pooled, tbl_vs)
        _create_table(client_pooled, tbl_vs, metric=metric)
        # Insert 1000 rows
        rows_vs, _ = _gen_rows(1000)
        bench_insert_batch(client_pooled, tbl_vs, rows_vs, batch_size=100)
        times = bench_vector_search(client_pooled, tbl_vs, metric, topk=10, n_queries=20)
        logger.info("  %-15s: %s", metric, _fmt(times))
        _drop(client_pooled, tbl_vs)

    # ------------------------------------------------------------------
    # Cleanup
    # ------------------------------------------------------------------
    for n in [100, 500, 1000]:
        _drop(client, f"bench_single_{n}")
        _drop(client, f"bench_batch_{n}")

    logger.info("\n%s", "=" * 70)
    logger.info("Benchmark complete.")
    logger.info("=" * 70)


if __name__ == "__main__":
    main()