Embedding Model Comparison Benchmark - EmbedComp

Models: e5-base-v2 · bge-base-en-v1.5 · multilingual-e5-base · all-MiniLM-L6-v2 · nomic-embed-text-v1
Dataset: BEIR trec-covid corpus
Metrics: Encode throughput · Query latency (mean/p95/p99) · Recall@K · MRR · Cosine distribution

Search/Recommendation System Metrics Overview

Metric	What it Measures	Ideal Improvement / Interpretation
Encode throughput	How many documents the model can embed per second (system capacity).	⬆️ Higher (Higher is better)
Query latency (mean)	Average time from "user types query" to "results returned," in milliseconds.	⬇️ Lower (Lower is better)
Query latency (p95)	The time by which 95% of all queries finish—a realistic worst-case for most users.	⬇️ Lower (Lower is better)
Query latency (p99)	The time by which 99% of all queries finish—representing tail latency experienced by the slowest 1 in 100 users.	⬇️ Lower (Lower is better)
Recall@K (e.g., Recall@3)	Share of the relevant documents that could fit in $K$ slots and actually appear in the top $K$ results: $\text{hits} / \min(|\text{relevant}|, K)$.	⬆️ Higher (Higher is better; ideal = 1.0)
MRR	Mean Reciprocal Rank: the average over queries of $1/\text{rank}$ of the first relevant result (e.g., if a relevant document is always ranked first, MRR is 1.0).	⬆️ Higher (Closer to 1.0 is better)
Cosine distribution	The spread of query–document cosine similarity scores across the top-K hits. Shows how strongly and how consistently the retrieved results match the query.	📈 Mean higher · Spread narrower (Ideal is a high mean with minimal variance)

# Run once to install deps
!pip install sentence-transformers datasets faiss-cpu pandas numpy matplotlib seaborn

import time
import numpy as np
import pandas as pd
import faiss
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.patches import Circle
import seaborn as sns
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Set matplotlib style
# Dark theme, consistent with the dark figure/axes face colours the chart
# cells below set explicitly.
plt.style.use('dark_background')
# Default seaborn colour cycle for any plot that does not pass explicit colours.
sns.set_palette("husl")
# Reaching this line means every import above succeeded.
print('All imports OK')

Matplotlib is building the font cache; this may take a moment.
/Users/sarav/Documents/Sarav/githubrepo/EmbedComp/.venv/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

All imports OK

# ── CONFIG ── swap models / queries freely here ──────────────────────
# Display name -> Hugging Face model id. The display name is also the key
# used by QUERY_PREFIX and DOC_PREFIX, so all three dicts must share keys.
MODELS = {
    'e5-base-v2':           'intfloat/e5-base-v2',
    'bge-base-en':          'BAAI/bge-base-en-v1.5',
    'multilingual-e5-base': 'intfloat/multilingual-e5-base',
    'all-MiniLM-L6-v2':     'sentence-transformers/all-MiniLM-L6-v2',
    'nomic-embed-v1':       'nomic-ai/nomic-embed-text-v1',
}

# Text prepended to every query before encoding ('' = model needs no prefix).
QUERY_PREFIX = {
    'e5-base-v2':           'query: ',
    'bge-base-en':          'Represent this sentence for searching relevant passages: ',
    'multilingual-e5-base': 'query: ',
    'all-MiniLM-L6-v2':     '',
    # nomic-embed-text uses task-instruction prefixes, not the e5-style ones.
    'nomic-embed-v1':       'search_query: ',
}
# Text prepended to every corpus document before encoding.
DOC_PREFIX = {
    'e5-base-v2':           'passage: ',
    'bge-base-en':          '',
    'multilingual-e5-base': 'passage: ',
    'all-MiniLM-L6-v2':     '',
    'nomic-embed-v1':       'search_document: ',
}

# Fail fast here instead of with a KeyError halfway through a benchmark run.
if not (MODELS.keys() == QUERY_PREFIX.keys() == DOC_PREFIX.keys()):
    raise ValueError('MODELS, QUERY_PREFIX and DOC_PREFIX must define the same model names')

CORPUS_SIZE  = 500   # increase on GPU
BATCH_SIZE   = 16
TOP_K        = 5
LATENCY_RUNS = 10    # number of runs to average latency over

# Ground-truth: query -> list of relevant doc indices in corpus
# These will be computed dynamically using e5-base-v2 as reference
QUERIES = [
    'How does COVID-19 affect lung tissue?',
    'What are the symptoms of coronavirus infection?',
    'How is PCR testing used to detect COVID-19?',
    'What treatments exist for severe COVID cases?',
    'How does the spike protein enable viral entry?',
]
GROUND_TRUTH = {}  # Will be populated after loading corpus

# One colour per model, in MODELS order.
COLORS = ['#58a6ff', '#3fb950', '#a371f7', '#f0883e', '#79c0ff']
# Report len(QUERIES): GROUND_TRUTH is still empty at this point, so counting
# it would always print "0 queries".
print(f'Config ready: {len(MODELS)} models, {CORPUS_SIZE} docs, {len(QUERIES)} queries')

Config ready: 5 models, 500 docs, 0 queries

# ── Load the corpus slice and derive pseudo ground truth ─────────────
print('Loading BEIR trec-covid corpus...')
dataset = load_dataset('BeIR/trec-covid', 'corpus')['corpus'].select(range(CORPUS_SIZE))
corpus_texts = [doc['text'] for doc in dataset]
print(f'Loaded {len(corpus_texts)} documents')
print(f'Sample doc: {corpus_texts[0][:150]}...')

# Generate ground truth: the 10 nearest corpus docs per query according to
# e5-base-v2 are treated as "relevant".
# NOTE: because e5-base-v2 is also one of the benchmarked models, it is scored
# against its own rankings; the other models are effectively measured by how
# closely they agree with e5-base-v2.
print('\nGenerating ground truth (finding top-10 similar docs per query)...')
ref_model = SentenceTransformer('intfloat/e5-base-v2')
prefixed_passages = ['passage: ' + text for text in corpus_texts]
corpus_embs = ref_model.encode(
    prefixed_passages,
    batch_size=BATCH_SIZE,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=False,
)
# Inner product over normalised embeddings.
index_ref = faiss.IndexFlatIP(corpus_embs.shape[1])
index_ref.add(corpus_embs.astype(np.float32))

for query in QUERIES:
    query_emb = ref_model.encode(['query: ' + query], normalize_embeddings=True)
    _scores, neighbours = index_ref.search(query_emb.astype(np.float32), k=10)
    top_ids = neighbours[0].tolist()
    GROUND_TRUTH[query] = top_ids
    print(f"  '{query[:40]}...' → top-10 indices: {top_ids}")

# Drop the reference model before the per-model benchmarks load their own.
del ref_model
print(f'Ground truth ready: {len(GROUND_TRUTH)} queries')

Loading BEIR trec-covid corpus...
Loaded 500 documents
Sample doc: OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae i...

Generating ground truth (finding top-10 similar docs per query)...

Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 6842.03it/s]

  'How does COVID-19 affect lung tissue?...' → top-10 indices: [487, 181, 173, 138, 4, 444, 211, 439, 132, 314]
  'What are the symptoms of coronavirus inf...' → top-10 indices: [408, 487, 464, 388, 439, 16, 338, 488, 426, 173]
  'How is PCR testing used to detect COVID-...' → top-10 indices: [102, 15, 45, 338, 69, 227, 494, 439, 251, 247]
  'What treatments exist for severe COVID c...' → top-10 indices: [252, 249, 128, 338, 105, 354, 94, 439, 6, 225]
  'How does the spike protein enable viral ...' → top-10 indices: [357, 401, 409, 498, 378, 449, 347, 415, 159, 215]
Ground truth ready: 5 queries

def benchmark_model(model_name, model_id):
    """Load one embedding model and measure its speed and retrieval quality.

    Reads module-level state: ``QUERY_PREFIX``, ``DOC_PREFIX``, ``BATCH_SIZE``,
    ``TOP_K``, ``LATENCY_RUNS``, the loaded ``corpus_texts`` and the
    ``GROUND_TRUTH`` mapping of query text -> relevant corpus indices.

    Args:
        model_name: Display name; key into ``QUERY_PREFIX`` / ``DOC_PREFIX``.
        model_id: Model identifier passed to ``SentenceTransformer``.

    Returns:
        dict with keys ``model``, ``load_time_s``, ``encode_time_s``,
        ``throughput_docs_s``, ``embedding_dim``, ``memory_mb``,
        ``latency_ms_mean``, ``latency_ms_p95``, ``latency_ms_p99``,
        ``recall@1``, ``recall@3``, ``recall@5``, ``mrr``,
        ``avg_top_cosine`` and ``cosine_scores`` (the raw top-K scores of
        every evaluated query, flattened into one list).

    Raises:
        ValueError: if ``GROUND_TRUTH`` is empty, i.e. the ground-truth cell
            has not been run yet.
        KeyError: if ``model_name`` has no entry in the prefix dicts.
    """
    # Fail before the expensive model load / corpus encode, not after it.
    if not GROUND_TRUTH:
        raise ValueError('GROUND_TRUTH is empty - generate the ground truth before benchmarking')

    print(f"\n{'='*60}\n  {model_name}\n{'='*60}")
    m = {'model': model_name}
    qpfx, dpfx = QUERY_PREFIX[model_name], DOC_PREFIX[model_name]

    # 1. Load
    t0 = time.perf_counter()
    model = SentenceTransformer(model_id)
    m['load_time_s'] = round(time.perf_counter() - t0, 2)
    print(f'  Load         : {m["load_time_s"]}s')

    # 2. Encode corpus -> throughput
    docs = [dpfx + t for t in corpus_texts]
    t0 = time.perf_counter()
    embs = model.encode(docs, batch_size=BATCH_SIZE, convert_to_numpy=True,
                        normalize_embeddings=True, show_progress_bar=False)
    enc_time = time.perf_counter() - t0
    m['encode_time_s']     = round(enc_time, 3)
    m['throughput_docs_s'] = round(len(corpus_texts) / enc_time, 1)
    m['embedding_dim']     = embs.shape[1]
    # Size of the raw embedding matrix, in megabytes.
    m['memory_mb']         = round(embs.nbytes / 1e6, 2)
    print(f'  Throughput   : {m["throughput_docs_s"]} docs/s  |  dim={m["embedding_dim"]}  |  {m["memory_mb"]} MB')

    # 3. FAISS index (IndexFlatIP = cosine for normalised vecs)
    index = faiss.IndexFlatIP(embs.shape[1])
    index.add(embs.astype(np.float32))

    # 4. Query latency — encode + search, averaged over LATENCY_RUNS
    sample_q = qpfx + next(iter(GROUND_TRUTH))
    # Untimed warm-up: the first single-query call can pay one-off costs that
    # would otherwise land in the small sample and dominate p95/p99.
    warm = model.encode([sample_q], normalize_embeddings=True)
    index.search(warm.astype(np.float32), k=TOP_K)
    lats = []
    for _ in range(LATENCY_RUNS):
        t0 = time.perf_counter()
        qe = model.encode([sample_q], normalize_embeddings=True)
        index.search(qe.astype(np.float32), k=TOP_K)
        lats.append((time.perf_counter() - t0) * 1000)
    m['latency_ms_mean'] = round(np.mean(lats), 2)
    m['latency_ms_p95']  = round(np.percentile(lats, 95), 2)
    m['latency_ms_p99']  = round(np.percentile(lats, 99), 2)
    print(f'  Latency      : mean={m["latency_ms_mean"]}ms  p95={m["latency_ms_p95"]}ms  p99={m["latency_ms_p99"]}ms')

    # 5. Recall@K and MRR
    recall_at = defaultdict(list)
    mrr_list, cosines = [], []
    for qtext, relevant in GROUND_TRUTH.items():
        qe = model.encode([qpfx + qtext], normalize_embeddings=True)
        D, I = index.search(qe.astype(np.float32), k=TOP_K)
        ret = I[0].tolist()
        cosines.extend(D[0].tolist())
        # Build the lookup set once per query instead of once per k.
        relevant_set = set(relevant)
        for k in [1, 3, 5]:
            hits = len(relevant_set.intersection(ret[:k]))
            # Capped denominator: at most k relevant docs can fit in k slots.
            recall_at[k].append(hits / min(len(relevant), k))
        # Reciprocal rank of the first relevant hit (0.0 if none in the top-K).
        rr = next((1/r for r, d in enumerate(ret, 1) if d in relevant_set), 0.0)
        mrr_list.append(rr)
    m['recall@1'] = round(np.mean(recall_at[1]), 4)
    m['recall@3'] = round(np.mean(recall_at[3]), 4)
    m['recall@5'] = round(np.mean(recall_at[5]), 4)
    m['mrr']      = round(np.mean(mrr_list), 4)
    m['avg_top_cosine'] = round(np.mean(cosines), 4)
    m['cosine_scores']  = cosines
    print(f'  Recall@1/3/5 : {m["recall@1"]} / {m["recall@3"]} / {m["recall@5"]}')
    print(f'  MRR          : {m["mrr"]}  |  Avg cosine: {m["avg_top_cosine"]}')
    # Drop the local reference so the model can be reclaimed before the next one loads.
    del model
    return m

# Benchmark every configured model in MODELS order; each entry is the metrics
# dict returned by benchmark_model (including the raw 'cosine_scores' list).
all_metrics = [benchmark_model(name, mid) for name, mid in MODELS.items()]

# Tabular view of the scalar metrics only: the per-hit score lists stay in
# all_metrics for the box plot and are dropped from the DataFrame.
df = pd.DataFrame(all_metrics).drop(columns='cosine_scores')

print('\nDone! Summary:')
headline_columns = ['model', 'throughput_docs_s', 'latency_ms_mean', 'recall@5', 'mrr']
print(df[headline_columns].to_string(index=False))

============================================================
  e5-base-v2
============================================================

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 7275.81it/s]

  Load         : 5.87s
  Throughput   : 53.6 docs/s  |  dim=768  |  1.54 MB
  Latency      : mean=7.26ms  p95=8.03ms  p99=8.49ms
  Recall@1/3/5 : 1.0 / 1.0 / 1.0
  MRR          : 1.0  |  Avg cosine: 0.8308

============================================================
  bge-base-en
============================================================

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 5949.20it/s]

  Load         : 6.59s
  Throughput   : 52.3 docs/s  |  dim=768  |  1.54 MB
  Latency      : mean=20.22ms  p95=81.81ms  p99=125.79ms
  Recall@1/3/5 : 0.8 / 0.5333 / 0.56
  MRR          : 0.84  |  Avg cosine: 0.6298

============================================================
  multilingual-e5-base
============================================================

Loading weights: 100%|██████████| 199/199 [00:00<00:00, 4533.84it/s]

  Load         : 9.04s
  Throughput   : 43.2 docs/s  |  dim=768  |  1.54 MB
  Latency      : mean=85.51ms  p95=435.49ms  p99=712.85ms
  Recall@1/3/5 : 1.0 / 0.9333 / 0.76
  MRR          : 1.0  |  Avg cosine: 0.8398

============================================================
  all-MiniLM-L6-v2
============================================================

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 7325.74it/s]

  Load         : 6.77s
  Throughput   : 233.8 docs/s  |  dim=384  |  0.77 MB
  Latency      : mean=38.92ms  p95=193.24ms  p99=314.91ms
  Recall@1/3/5 : 0.6 / 0.4667 / 0.52
  MRR          : 0.7  |  Avg cosine: 0.4388

============================================================
  nomic-embed-v1
============================================================

Loading weights: 100%|██████████| 112/112 [00:00<00:00, 6141.08it/s]

# ── 2-column Matplotlib Dashboard (2 charts per row) ────────────────
model_names = df['model'].tolist()
# Reuse the palette from the config cell instead of a second hard-coded copy.
colors_palette = COLORS[:len(model_names)]


def _style_panel(ax):
    """Apply the shared dark panel background and horizontal grid lines."""
    ax.set_facecolor('#161b22')
    ax.grid(axis='y', alpha=0.3, color='#30363d')


def _bar_panel(ax, values, title, ylabel, label_fmt):
    """Draw one bar per model (module-level ``model_names``) with its value on top.

    ``label_fmt`` is a ``str.format`` template applied to each bar height.
    """
    bars = ax.bar(model_names, values, color=colors_palette, alpha=0.8,
                  edgecolor='white', linewidth=1.5)
    ax.set_title(title, fontsize=12, fontweight='bold', pad=10)
    ax.set_ylabel(ylabel, fontsize=10)
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height, label_fmt.format(height),
                ha='center', va='bottom', fontsize=9, color='#e6edf3')
    _style_panel(ax)
    ax.tick_params(axis='x', rotation=45, labelsize=10)


def _grouped_panel(ax, labelled_columns, title, ylabel, legend_loc, width=0.25):
    """Draw three side-by-side bars per model, one per (label, df column) pair.

    The three series share the model colours and are told apart by opacity.
    """
    x = np.arange(len(model_names))
    offsets = (-width, 0, width)
    alphas = (1.0, 0.65, 0.4)
    for offset, alpha, (label, column) in zip(offsets, alphas, labelled_columns):
        ax.bar(x + offset, df[column], width, label=label, color=colors_palette,
               alpha=alpha, edgecolor='white', linewidth=0.5)
    ax.set_title(title, fontsize=12, fontweight='bold', pad=10)
    ax.set_ylabel(ylabel, fontsize=10)
    ax.set_xticks(x)
    ax.set_xticklabels(model_names, rotation=45, fontsize=10)
    ax.legend(fontsize=8, loc=legend_loc)
    _style_panel(ax)


fig, axes = plt.subplots(5, 2, figsize=(16, 28))
fig.patch.set_facecolor('#0d1117')

# R1C1: Throughput
_bar_panel(axes[0, 0], df['throughput_docs_s'], 'Throughput (docs/sec) ↑', 'docs/sec', '{:.1f}')

# R1C2: MRR
_bar_panel(axes[0, 1], df['mrr'], 'Mean Reciprocal Rank (MRR) ↑', 'MRR', '{:.3f}')

# R2C1: Latency (Mean, p95, p99)
_grouped_panel(axes[1, 0],
               [('Mean', 'latency_ms_mean'), ('p95', 'latency_ms_p95'), ('p99', 'latency_ms_p99')],
               'Query Latency mean/p95/p99 (ms) ↓', 'Latency (ms)', 'upper right')

# R2C2: Recall (R@1, R@3, R@5)
_grouped_panel(axes[1, 1],
               [('R@1', 'recall@1'), ('R@3', 'recall@3'), ('R@5', 'recall@5')],
               'Recall@1 / @3 / @5 ↑', 'Recall', 'lower right')

# R3C1: Embedding Dimension
_bar_panel(axes[2, 0], df['embedding_dim'], 'Embedding Dimension', 'Dimension', '{:.0f}')

# R3C2: Cosine Distribution (Box plot)
ax = axes[2, 1]
# all_metrics is in the same order as df, so box i belongs to model_names[i].
cosine_data = [m['cosine_scores'] for m in all_metrics]
bp = ax.boxplot(cosine_data, patch_artist=True, widths=0.6)
for patch, color in zip(bp['boxes'], colors_palette):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
for whisker in bp['whiskers']:
    whisker.set_color('#e6edf3')
for median in bp['medians']:
    median.set_color('#e6edf3')
    median.set_linewidth(2)
ax.set_title('Cosine Score Distribution', fontsize=12, fontweight='bold', pad=10)
ax.set_ylabel('Cosine Scores', fontsize=10)
# Box plot positions start at 1, not 0.
ax.set_xticks(range(1, len(model_names) + 1))
ax.set_xticklabels(model_names, rotation=45, fontsize=10)
_style_panel(ax)

# R4C1: Index Memory
_bar_panel(axes[3, 0], df['memory_mb'], 'Index Memory (MB) ↓', 'Memory (MB)', '{:.1f}')

# R4C2: Model Load Time
_bar_panel(axes[3, 1], df['load_time_s'], 'Model Load Time (s) ↓', 'Load Time (s)', '{:.2f}')

# R5C1: Average Top-K Cosine (K comes from the config, not a hard-coded 5)
_bar_panel(axes[4, 0], df['avg_top_cosine'], f'Avg Top-{TOP_K} Cosine ↑', 'Cosine Score', '{:.3f}')

# R5C2: Empty (hide)
axes[4, 1].axis('off')

# Build the subtitle from the actual run configuration so it cannot go stale
# when CORPUS_SIZE, the query list or LATENCY_RUNS are changed.
fig.suptitle('Embedding Model Comparison Dashboard\n'
             f'BEIR trec-covid · {CORPUS_SIZE} docs · {len(GROUND_TRUTH)} queries · {LATENCY_RUNS} latency runs',
             fontsize=16, fontweight='bold', y=0.995, color='#e6edf3')
plt.tight_layout(rect=[0, 0, 1, 0.99])
plt.subplots_adjust(hspace=0.5)
plt.show()

# ── Radar chart — overall profile per model ──────────────────────────
import numpy as np
from math import pi

def minmax_norm(s, invert=False):
    """Min-max scale values into [0, 1] and return them as a plain list.

    Args:
        s: Any 1-D numeric array-like (pandas Series, NumPy array, list, tuple).
        invert: If True, return ``1 - scaled`` so the smallest input maps to
            1.0 (for "lower is better" metrics).

    Returns:
        list[float] of the same length as ``s``. All values are 0.5 when every
        (non-NaN) input is identical, since no ranking is possible. NaN inputs
        are ignored when finding the range and stay NaN in the output.
    """
    values = np.asarray(s, dtype=float)
    if values.size == 0:
        return []

    finite = values[~np.isnan(values)]
    if finite.size == 0:
        # Nothing to scale against; propagate the NaNs unchanged.
        return values.tolist()

    lo, hi = finite.min(), finite.max()
    if hi == lo:
        return [0.5] * values.size

    scaled = (values - lo) / (hi - lo)
    if invert:
        scaled = 1 - scaled
    return scaled.tolist()

# Radar axes: label -> (df column, whether lower raw values are better).
# Every axis is min-max normalised so that 1.0 is always the best model.
radar_spec = [
    ('Throughput',  'throughput_docs_s', False),
    ('Low Latency', 'latency_ms_mean',   True),
    ('Recall@5',    'recall@5',          False),
    ('MRR',         'mrr',               False),
    ('Cosine Qual', 'avg_top_cosine',    False),
    ('Low Memory',  'memory_mb',         True),
]
axes_dict = {
    label: minmax_norm(df[column], invert=lower_is_better)
    for label, column, lower_is_better in radar_spec
}
cats = list(axes_dict.keys())
num_vars = len(cats)

# One evenly spaced angle per axis; repeat the first to close the polygon.
angles = [n / float(num_vars) * 2 * pi for n in range(num_vars)]
angles.append(angles[0])

# Polar axes on the shared dark background
fig, ax = plt.subplots(figsize=(12, 10), subplot_kw=dict(projection='polar'))
fig.patch.set_facecolor('#0d1117')
ax.set_facecolor('#161b22')

colors = ['#58a6ff', '#3fb950', '#a371f7', '#f0883e', '#79c0ff'][:len(model_names)]

for model_idx, model_label in enumerate(model_names):
    profile = [axes_dict[axis_name][model_idx] for axis_name in cats]
    closed_profile = profile + profile[:1]
    ax.plot(angles, closed_profile, 'o-', linewidth=2.5, label=model_label, color=colors[model_idx])
    ax.fill(angles, closed_profile, alpha=0.25, color=colors[model_idx])

# Axis labels sit on the unique angles (the closing duplicate is dropped).
ax.set_xticks(angles[:-1])
ax.set_xticklabels(cats, size=11, color='#e6edf3')
ax.set_ylim(0, 1)
ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'], size=8, color='#a0aec0')
ax.grid(True, color='#30363d', alpha=0.3)

# Legend is anchored outside the plot area.
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=10, framealpha=0.9)
ax.set_title('Overall Model Profile\nAll metrics normalised 0–1 (higher = better on every axis)',
             fontsize=14, fontweight='bold', color='#e6edf3', pad=20)
plt.tight_layout()
plt.show()

# ── Latency vs Recall@5 bubble chart — the sweet-spot view ───────────
# x = mean query latency (lower is better), y = Recall@5 (higher is better),
# bubble area = encode throughput relative to the fastest model.
fig, ax = plt.subplots(figsize=(12, 7))
fig.patch.set_facecolor('#0d1117')
ax.set_facecolor('#161b22')

colors = ['#58a6ff', '#3fb950', '#a371f7', '#f0883e', '#79c0ff'][:len(model_names)]
max_throughput = df['throughput_docs_s'].max()
sizes = (df['throughput_docs_s'] / max_throughput) * 1000  # Scale bubble sizes

scatter = ax.scatter(df['latency_ms_mean'], df['recall@5'], s=sizes, c=colors, alpha=0.6,
                     edgecolors='white', linewidth=2)

# Add model name labels, offset slightly from each bubble centre
for name, latency, recall in zip(df['model'], df['latency_ms_mean'], df['recall@5']):
    ax.annotate(name,
                xy=(latency, recall),
                xytext=(5, 5), textcoords='offset points',
                fontsize=10, color='#e6edf3', fontweight='bold')

ax.set_xlabel('Query Latency mean (ms) ↓', fontsize=12, fontweight='bold', color='#e6edf3')
ax.set_ylabel('Recall@5 ↑', fontsize=12, fontweight='bold', color='#e6edf3')
# Low latency is the LEFT of the x-axis and high recall the TOP of the y-axis,
# so the ideal corner is top-left.
ax.set_title('Latency vs Recall@5 (bubble size = throughput docs/sec)\nIdeal: top-left — low latency, high recall',
             fontsize=13, fontweight='bold', color='#e6edf3', pad=15)

ax.grid(True, alpha=0.2, color='#30363d')
ax.tick_params(colors='#e6edf3', labelsize=10)

# Add legend for bubble sizes: empty scatters drawn with the same area scaling
# as the real bubbles, for the min / median / max throughput.
legend_sizes = [df['throughput_docs_s'].min(), df['throughput_docs_s'].median(), max_throughput]
legend_bubbles = [
    ax.scatter([], [], s=(size / max_throughput) * 1000,
               c='#e6edf3', alpha=0.6, edgecolors='white', linewidth=1.5)
    for size in legend_sizes
]
ax.legend(legend_bubbles, [f'{s:.0f} docs/s' for s in legend_sizes],
          scatterpoints=1, frameon=True, labelspacing=2, title='Throughput',
          loc='lower left', fontsize=9, title_fontsize=10, framealpha=0.9)

plt.tight_layout()
plt.show()

# ── Summary table + winners ──────────────────────────────────────────
# df column -> short header shown in the rendered table (order = column order).
summary_headers = {
    'model':             'Model',
    'embedding_dim':     'Dim',
    'throughput_docs_s': 'Thru\n(docs/s)',
    'latency_ms_mean':   'Latency\nMean(ms)',
    'latency_ms_p95':    'Latency\np95(ms)',
    'recall@1':          'R@1',
    'recall@3':          'R@3',
    'recall@5':          'R@5',
    'mrr':               'MRR',
    'avg_top_cosine':    'AvgCos',
    'memory_mb':         'Mem\n(MB)',
    'load_time_s':       'Load\n(s)',
}
summary = df[list(summary_headers)].copy()
summary.columns = list(summary_headers.values())

# Axes-free figure that only hosts the table
fig, ax = plt.subplots(figsize=(16, 6))
fig.patch.set_facecolor('#0d1117')
ax.axis('tight')
ax.axis('off')

# Row 0 is the header; one further row per model
table_data = [summary.columns.tolist()]
table_data.extend(row.tolist() for _, row in summary.iterrows())

table = ax.table(
    cellText=table_data,
    cellLoc='center',
    loc='center',
    colWidths=[0.10, 0.08, 0.10, 0.10, 0.10, 0.08, 0.08, 0.08, 0.08, 0.10, 0.08, 0.08],
)

# Fixed font size and taller rows for readability
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 2.5)

n_cols = len(summary.columns)

# Header row: green background, dark bold text
for col_idx in range(n_cols):
    header_cell = table[(0, col_idx)]
    header_cell.set_facecolor('#3fb950')
    header_cell.set_text_props(weight='bold', color='#0d1117')

# Data rows: two alternating dark shades
colors_alt = ['#161b22', '#1c2128']
for row_idx in range(1, len(table_data)):
    row_shade = colors_alt[row_idx % 2]
    for col_idx in range(n_cols):
        body_cell = table[(row_idx, col_idx)]
        body_cell.set_facecolor(row_shade)
        body_cell.set_text_props(color='#e6edf3')
        body_cell.set_edgecolor('#30363d')

plt.title('EMBEDDING MODEL COMPARISON — FINAL SUMMARY',
          fontsize=14, fontweight='bold', color='#e6edf3', pad=20)
plt.tight_layout()
plt.show()

# Winners table: for each headline metric, the model with the best value.
# Each spec is (row label, df column, 'max' if higher is better else 'min').
winner_specs = [
    ('🚀 Throughput', 'throughput_docs_s', 'max'),
    ('⚡ Latency', 'latency_ms_mean', 'min'),
    ('🎯 Recall@5', 'recall@5', 'max'),
    ('📊 MRR', 'mrr', 'max'),
    ('✨ Cosine Quality', 'avg_top_cosine', 'max'),
    ('💾 Low Memory', 'memory_mb', 'min'),
]

winners_data = [['Metric', 'Winner Model', 'Score']]
for metric_label, column, direction in winner_specs:
    if direction == 'max':
        best_row = df[column].idxmax()
    else:
        best_row = df[column].idxmin()
    best_value = df.loc[best_row, column]
    best_model = df.loc[best_row, 'model']
    winners_data.append([metric_label, best_model, f'{best_value}'])

# Axes-free figure that only hosts the table
fig, ax = plt.subplots(figsize=(12, 5))
fig.patch.set_facecolor('#0d1117')
ax.axis('tight')
ax.axis('off')

winners_table = ax.table(
    cellText=winners_data,
    cellLoc='center',
    loc='center',
    colWidths=[0.35, 0.35, 0.30],
)

# Fixed font size and taller rows for readability
winners_table.auto_set_font_size(False)
winners_table.set_fontsize(11)
winners_table.scale(1, 2.8)

n_winner_cols = 3

# Header row: purple background, dark bold text
for col_idx in range(n_winner_cols):
    header_cell = winners_table[(0, col_idx)]
    header_cell.set_facecolor('#a371f7')
    header_cell.set_text_props(weight='bold', color='#0d1117', size=12)

# Data rows: two alternating dark shades; the winner-model column is bold
colors_winners = ['#161b22', '#1c2128']
for row_idx in range(1, len(winners_data)):
    row_shade = colors_winners[row_idx % 2]
    for col_idx in range(n_winner_cols):
        body_cell = winners_table[(row_idx, col_idx)]
        body_cell.set_facecolor(row_shade)
        if col_idx == 1:
            text_weight = 'bold'
        else:
            text_weight = 'normal'
        body_cell.set_text_props(color='#e6edf3', weight=text_weight)
        body_cell.set_edgecolor('#30363d')

plt.title('WINNERS BY METRIC',
          fontsize=14, fontweight='bold', color='#e6edf3', pad=20)
plt.tight_layout()
plt.show()