Embedding Model Comparison Benchmark - EmbedComp

Models: e5-base-v2 · bge-base-en-v1.5 · multilingual-e5-base · all-MiniLM-L6-v2
Dataset: BEIR trec-covid corpus
Metrics: Encode throughput · Query latency (mean/p95/p99) · Recall@K · MRR · Cosine distribution

Search/Recommendation System Metrics Overview

Metric What it Measures Ideal Improvement / Interpretation
Encode throughput How many documents the model can embed per second (system capacity). ⬆️ Higher (Higher is better)
Query latency (mean) Average time from "user types query" to "results returned," in milliseconds. ⬇️ Lower (Lower is better)
Query latency (p95) The time by which 95% of all queries finish—a realistic worst-case for most users. ⬇️ Lower (Lower is better)
Query latency (p99) The time by which 99% of all queries finish—representing tail latency experienced by the slowest 1 in 100 users. ⬇️ Lower (Lower is better)
Recall@K (e.g., Recall@3) Measures if a relevant document appears within the top $K$ results. ($\text{ideal} = 1.0$) ⬆️ Higher (Higher is better; ideal = 1.0)
MRR Mean Reciprocal Rank. How high the relevant document ranks on average (e.g., if always first, rank is 1.0). ⬆️ Higher (Closer to 1.0 is better)
Cosine distribution The spread of similarity scores across top-K hits. Measures how distinct your best results are from each other. 📈 Mean higher · Spread narrower (Ideal is a high mean with minimal variance)
In [ ]:
# Run once to install deps
!pip install sentence-transformers datasets faiss-cpu pandas numpy matplotlib seaborn
In [1]:
import time
import numpy as np
import pandas as pd
import faiss
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.patches import Circle
import seaborn as sns
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Set matplotlib style
plt.style.use('dark_background')
sns.set_palette("husl")
print('All imports OK')
Matplotlib is building the font cache; this may take a moment.
/Users/sarav/Documents/Sarav/githubrepo/EmbedComp/.venv/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
All imports OK
In [3]:
# ── CONFIG ── swap models / queries freely here ──────────────────────
MODELS = {
    'e5-base-v2':           'intfloat/e5-base-v2',
    'bge-base-en':          'BAAI/bge-base-en-v1.5',
    'multilingual-e5-base': 'intfloat/multilingual-e5-base',
    'all-MiniLM-L6-v2':    'sentence-transformers/all-MiniLM-L6-v2',
    'nomic-embed-v1':       'nomic-ai/nomic-embed-text-v1'
}

QUERY_PREFIX = {
    'e5-base-v2':           'query: ',
    'bge-base-en':          'Represent this sentence for searching relevant passages: ',
    'multilingual-e5-base': 'query: ',
    'all-MiniLM-L6-v2':    '',
    'nomic-embed-v1':               'query: '
}
DOC_PREFIX = {
    'e5-base-v2':           'passage: ',
    'bge-base-en':          '',
    'multilingual-e5-base': 'passage: ',
    'all-MiniLM-L6-v2':    '',
    'nomic-embed-v1':       'passage: '
}

CORPUS_SIZE  = 500   # increase on GPU
BATCH_SIZE   = 16
TOP_K        = 5
LATENCY_RUNS = 10    # number of runs to average latency over

# Ground-truth: query -> list of relevant doc indices in corpus
# These will be computed dynamically using e5-base-v2 as reference
QUERIES = [
    'How does COVID-19 affect lung tissue?',
    'What are the symptoms of coronavirus infection?',
    'How is PCR testing used to detect COVID-19?',
    'What treatments exist for severe COVID cases?',
    'How does the spike protein enable viral entry?',
]
GROUND_TRUTH = {}  # Will be populated after loading corpus

COLORS = ['#58a6ff', '#3fb950', '#a371f7', '#f0883e', '#79c0ff']
print(f'Config ready: {len(MODELS)} models, {CORPUS_SIZE} docs, {len(GROUND_TRUTH)} queries')
Config ready: 5 models, 500 docs, 0 queries
In [4]:
print('Loading BEIR trec-covid corpus...')
dataset = load_dataset('BeIR/trec-covid', 'corpus')['corpus'].select(range(CORPUS_SIZE))
corpus_texts = [doc['text'] for doc in dataset]
print(f'Loaded {len(corpus_texts)} documents')
print(f'Sample doc: {corpus_texts[0][:150]}...')

# Generate ground truth: find top-10 similar docs for each query using e5-base-v2
print('\nGenerating ground truth (finding top-10 similar docs per query)...')
ref_model = SentenceTransformer('intfloat/e5-base-v2')
corpus_embs = ref_model.encode(['passage: ' + t for t in corpus_texts], 
                                batch_size=BATCH_SIZE, convert_to_numpy=True, 
                                normalize_embeddings=True, show_progress_bar=False)
index_ref = faiss.IndexFlatIP(corpus_embs.shape[1])
index_ref.add(corpus_embs.astype(np.float32))

for query in QUERIES:
    qe = ref_model.encode(['query: ' + query], normalize_embeddings=True)
    D, I = index_ref.search(qe.astype(np.float32), k=10)
    GROUND_TRUTH[query] = I[0].tolist()
    print(f"  '{query[:40]}...' → top-10 indices: {I[0].tolist()}")

del ref_model
print(f'Ground truth ready: {len(GROUND_TRUTH)} queries')
Loading BEIR trec-covid corpus...
Loaded 500 documents
Sample doc: OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae i...

Generating ground truth (finding top-10 similar docs per query)...
Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 6842.03it/s]
  'How does COVID-19 affect lung tissue?...' → top-10 indices: [487, 181, 173, 138, 4, 444, 211, 439, 132, 314]
  'What are the symptoms of coronavirus inf...' → top-10 indices: [408, 487, 464, 388, 439, 16, 338, 488, 426, 173]
  'How is PCR testing used to detect COVID-...' → top-10 indices: [102, 15, 45, 338, 69, 227, 494, 439, 251, 247]
  'What treatments exist for severe COVID c...' → top-10 indices: [252, 249, 128, 338, 105, 354, 94, 439, 6, 225]
  'How does the spike protein enable viral ...' → top-10 indices: [357, 401, 409, 498, 378, 449, 347, 415, 159, 215]
Ground truth ready: 5 queries
In [5]:
def benchmark_model(model_name, model_id):
    print(f"\n{'='*60}\n  {model_name}\n{'='*60}")
    m = {'model': model_name}
    qpfx, dpfx = QUERY_PREFIX[model_name], DOC_PREFIX[model_name]

    # 1. Load
    t0 = time.perf_counter()
    model = SentenceTransformer(model_id)
    m['load_time_s'] = round(time.perf_counter() - t0, 2)
    print(f'  Load         : {m["load_time_s"]}s')

    # 2. Encode corpus -> throughput
    docs = [dpfx + t for t in corpus_texts]
    t0 = time.perf_counter()
    embs = model.encode(docs, batch_size=BATCH_SIZE, convert_to_numpy=True,
                        normalize_embeddings=True, show_progress_bar=False)
    enc_time = time.perf_counter() - t0
    m['encode_time_s']     = round(enc_time, 3)
    m['throughput_docs_s'] = round(len(corpus_texts) / enc_time, 1)
    m['embedding_dim']     = embs.shape[1]
    m['memory_mb']         = round(embs.nbytes / 1e6, 2)
    print(f'  Throughput   : {m["throughput_docs_s"]} docs/s  |  dim={m["embedding_dim"]}  |  {m["memory_mb"]} MB')

    # 3. FAISS index (IndexFlatIP = cosine for normalised vecs)
    index = faiss.IndexFlatIP(embs.shape[1])
    index.add(embs.astype(np.float32))

    # 4. Query latency — averaged over LATENCY_RUNS
    sample_q = qpfx + list(GROUND_TRUTH.keys())[0]
    lats = []
    for _ in range(LATENCY_RUNS):
        t0 = time.perf_counter()
        qe = model.encode([sample_q], normalize_embeddings=True)
        index.search(qe.astype(np.float32), k=TOP_K)
        lats.append((time.perf_counter() - t0) * 1000)
    m['latency_ms_mean'] = round(np.mean(lats), 2)
    m['latency_ms_p95']  = round(np.percentile(lats, 95), 2)
    m['latency_ms_p99']  = round(np.percentile(lats, 99), 2)
    print(f'  Latency      : mean={m["latency_ms_mean"]}ms  p95={m["latency_ms_p95"]}ms  p99={m["latency_ms_p99"]}ms')

    # 5. Recall@K and MRR
    recall_at = defaultdict(list)
    mrr_list, cosines = [], []
    for qtext, relevant in GROUND_TRUTH.items():
        qe = model.encode([qpfx + qtext], normalize_embeddings=True)
        D, I = index.search(qe.astype(np.float32), k=TOP_K)
        ret = I[0].tolist()
        cosines.extend(D[0].tolist())
        for k in [1, 3, 5]:
            hits = len(set(ret[:k]) & set(relevant))
            recall_at[k].append(hits / min(len(relevant), k))
        rr = next((1/r for r, d in enumerate(ret, 1) if d in relevant), 0.0)
        mrr_list.append(rr)
    m['recall@1'] = round(np.mean(recall_at[1]), 4)
    m['recall@3'] = round(np.mean(recall_at[3]), 4)
    m['recall@5'] = round(np.mean(recall_at[5]), 4)
    m['mrr']      = round(np.mean(mrr_list), 4)
    m['avg_top_cosine'] = round(np.mean(cosines), 4)
    m['cosine_scores']  = cosines
    print(f'  Recall@1/3/5 : {m["recall@1"]} / {m["recall@3"]} / {m["recall@5"]}')
    print(f'  MRR          : {m["mrr"]}  |  Avg cosine: {m["avg_top_cosine"]}')
    del model
    return m

all_metrics = [benchmark_model(name, mid) for name, mid in MODELS.items()]
df = pd.DataFrame([{k: v for k, v in m.items() if k != 'cosine_scores'} for m in all_metrics])
print('\nDone! Summary:')
print(df[['model','throughput_docs_s','latency_ms_mean','recall@5','mrr']].to_string(index=False))
============================================================
  e5-base-v2
============================================================
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 7275.81it/s]
  Load         : 5.87s
  Throughput   : 53.6 docs/s  |  dim=768  |  1.54 MB
  Latency      : mean=7.26ms  p95=8.03ms  p99=8.49ms
  Recall@1/3/5 : 1.0 / 1.0 / 1.0
  MRR          : 1.0  |  Avg cosine: 0.8308

============================================================
  bge-base-en
============================================================
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 5949.20it/s]
  Load         : 6.59s
  Throughput   : 52.3 docs/s  |  dim=768  |  1.54 MB
  Latency      : mean=20.22ms  p95=81.81ms  p99=125.79ms
  Recall@1/3/5 : 0.8 / 0.5333 / 0.56
  MRR          : 0.84  |  Avg cosine: 0.6298

============================================================
  multilingual-e5-base
============================================================
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 4533.84it/s]
  Load         : 9.04s
  Throughput   : 43.2 docs/s  |  dim=768  |  1.54 MB
  Latency      : mean=85.51ms  p95=435.49ms  p99=712.85ms
  Recall@1/3/5 : 1.0 / 0.9333 / 0.76
  MRR          : 1.0  |  Avg cosine: 0.8398

============================================================
  all-MiniLM-L6-v2
============================================================
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 7325.74it/s]
  Load         : 6.77s
  Throughput   : 233.8 docs/s  |  dim=384  |  0.77 MB
  Latency      : mean=38.92ms  p95=193.24ms  p99=314.91ms
  Recall@1/3/5 : 0.6 / 0.4667 / 0.52
  MRR          : 0.7  |  Avg cosine: 0.4388

============================================================
  nomic-embed-v1
============================================================
Loading weights: 100%|██████████| 112/112 [00:00<00:00, 6141.08it/s]
  Load         : 6.25s
  Throughput   : 33.5 docs/s  |  dim=768  |  1.54 MB
  Latency      : mean=29.38ms  p95=121.07ms  p99=194.06ms
  Recall@1/3/5 : 0.8 / 0.8 / 0.6
  MRR          : 0.9  |  Avg cosine: 0.5624

Done! Summary:
               model  throughput_docs_s  latency_ms_mean  recall@5  mrr
          e5-base-v2               53.6             7.26      1.00 1.00
         bge-base-en               52.3            20.22      0.56 0.84
multilingual-e5-base               43.2            85.51      0.76 1.00
    all-MiniLM-L6-v2              233.8            38.92      0.52 0.70
      nomic-embed-v1               33.5            29.38      0.60 0.90
In [18]:
# ── 2-column Matplotlib Dashboard (2 charts per row) ────────────────
model_names = df['model'].tolist()
colors_palette = ['#58a6ff', '#3fb950', '#a371f7', '#f0883e', '#79c0ff'][:len(model_names)]

fig, axes = plt.subplots(5, 2, figsize=(16, 28))
fig.patch.set_facecolor('#0d1117')

# R1C1: Throughput
ax = axes[0, 0]
bars = ax.bar(model_names, df['throughput_docs_s'], color=colors_palette, alpha=0.8, edgecolor='white', linewidth=1.5)
ax.set_title('Throughput (docs/sec) ↑', fontsize=12, fontweight='bold', pad=10)
ax.set_ylabel('docs/sec', fontsize=10)
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.1f}', ha='center', va='bottom', fontsize=9, color='#e6edf3')
ax.set_facecolor('#161b22')
ax.grid(axis='y', alpha=0.3, color='#30363d')
ax.tick_params(axis='x', rotation=45, labelsize=10)

# R1C2: MRR
ax = axes[0, 1]
bars = ax.bar(model_names, df['mrr'], color=colors_palette, alpha=0.8, edgecolor='white', linewidth=1.5)
ax.set_title('Mean Reciprocal Rank (MRR) ↑', fontsize=12, fontweight='bold', pad=10)
ax.set_ylabel('MRR', fontsize=10)
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.3f}', ha='center', va='bottom', fontsize=9, color='#e6edf3')
ax.set_facecolor('#161b22')
ax.grid(axis='y', alpha=0.3, color='#30363d')
ax.tick_params(axis='x', rotation=45, labelsize=10)

# R2C1: Latency (Mean, p95, p99)
ax = axes[1, 0]
x = np.arange(len(model_names))
width = 0.25
ax.bar(x - width, df['latency_ms_mean'], width, label='Mean', color=colors_palette, alpha=1.0, edgecolor='white', linewidth=0.5)
ax.bar(x, df['latency_ms_p95'], width, label='p95', color=colors_palette, alpha=0.65, edgecolor='white', linewidth=0.5)
ax.bar(x + width, df['latency_ms_p99'], width, label='p99', color=colors_palette, alpha=0.4, edgecolor='white', linewidth=0.5)
ax.set_title('Query Latency mean/p95/p99 (ms) ↓', fontsize=12, fontweight='bold', pad=10)
ax.set_ylabel('Latency (ms)', fontsize=10)
ax.set_xticks(x)
ax.set_xticklabels(model_names, rotation=45, fontsize=10)
ax.legend(fontsize=8, loc='upper right')
ax.set_facecolor('#161b22')
ax.grid(axis='y', alpha=0.3, color='#30363d')

# R2C2: Recall (R@1, R@3, R@5)
ax = axes[1, 1]
x = np.arange(len(model_names))
ax.bar(x - width, df['recall@1'], width, label='R@1', color=colors_palette, alpha=1.0, edgecolor='white', linewidth=0.5)
ax.bar(x, df['recall@3'], width, label='R@3', color=colors_palette, alpha=0.65, edgecolor='white', linewidth=0.5)
ax.bar(x + width, df['recall@5'], width, label='R@5', color=colors_palette, alpha=0.4, edgecolor='white', linewidth=0.5)
ax.set_title('Recall@1 / @3 / @5 ↑', fontsize=12, fontweight='bold', pad=10)
ax.set_ylabel('Recall', fontsize=10)
ax.set_xticks(x)
ax.set_xticklabels(model_names, rotation=45, fontsize=10)
ax.legend(fontsize=8, loc='lower right')
ax.set_facecolor('#161b22')
ax.grid(axis='y', alpha=0.3, color='#30363d')

# R3C1: Embedding Dimension
ax = axes[2, 0]
bars = ax.bar(model_names, df['embedding_dim'], color=colors_palette, alpha=0.8, edgecolor='white', linewidth=1.5)
ax.set_title('Embedding Dimension', fontsize=12, fontweight='bold', pad=10)
ax.set_ylabel('Dimension', fontsize=10)
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height)}', ha='center', va='bottom', fontsize=9, color='#e6edf3')
ax.set_facecolor('#161b22')
ax.grid(axis='y', alpha=0.3, color='#30363d')
ax.tick_params(axis='x', rotation=45, labelsize=10)

# R3C2: Cosine Distribution (Box plot)
ax = axes[2, 1]
cosine_data = [m['cosine_scores'] for m in all_metrics]
bp = ax.boxplot(cosine_data, patch_artist=True, widths=0.6)
for patch, color in zip(bp['boxes'], colors_palette):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
for whisker in bp['whiskers']:
    whisker.set_color('#e6edf3')
for median in bp['medians']:
    median.set_color('#e6edf3')
    median.set_linewidth(2)
ax.set_title('Cosine Score Distribution', fontsize=12, fontweight='bold', pad=10)
ax.set_ylabel('Cosine Scores', fontsize=10)
ax.set_xticks(range(1, len(model_names) + 1))
ax.set_xticklabels(model_names, rotation=45, fontsize=10)
ax.set_facecolor('#161b22')
ax.grid(axis='y', alpha=0.3, color='#30363d')

# R4C1: Index Memory
ax = axes[3, 0]
bars = ax.bar(model_names, df['memory_mb'], color=colors_palette, alpha=0.8, edgecolor='white', linewidth=1.5)
ax.set_title('Index Memory (MB) ↓', fontsize=12, fontweight='bold', pad=10)
ax.set_ylabel('Memory (MB)', fontsize=10)
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.1f}', ha='center', va='bottom', fontsize=9, color='#e6edf3')
ax.set_facecolor('#161b22')
ax.grid(axis='y', alpha=0.3, color='#30363d')
ax.tick_params(axis='x', rotation=45, labelsize=10)

# R4C2: Model Load Time
ax = axes[3, 1]
bars = ax.bar(model_names, df['load_time_s'], color=colors_palette, alpha=0.8, edgecolor='white', linewidth=1.5)
ax.set_title('Model Load Time (s) ↓', fontsize=12, fontweight='bold', pad=10)
ax.set_ylabel('Load Time (s)', fontsize=10)
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.2f}', ha='center', va='bottom', fontsize=9, color='#e6edf3')
ax.set_facecolor('#161b22')
ax.grid(axis='y', alpha=0.3, color='#30363d')
ax.tick_params(axis='x', rotation=45, labelsize=10)

# R5C1: Average Top-5 Cosine
ax = axes[4, 0]
bars = ax.bar(model_names, df['avg_top_cosine'], color=colors_palette, alpha=0.8, edgecolor='white', linewidth=1.5)
ax.set_title('Avg Top-5 Cosine ↑', fontsize=12, fontweight='bold', pad=10)
ax.set_ylabel('Cosine Score', fontsize=10)
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.3f}', ha='center', va='bottom', fontsize=9, color='#e6edf3')
ax.set_facecolor('#161b22')
ax.grid(axis='y', alpha=0.3, color='#30363d')
ax.tick_params(axis='x', rotation=45, labelsize=10)

# R5C2: Empty (hide)
axes[4, 1].axis('off')

fig.suptitle('Embedding Model Comparison Dashboard\nBEIR trec-covid · 500 docs · 5 queries · 10 latency runs',
             fontsize=16, fontweight='bold', y=0.995, color='#e6edf3')
plt.tight_layout(rect=[0, 0, 1, 0.99])
plt.subplots_adjust(hspace=0.5)
plt.show()
No description has been provided for this image
In [16]:
# ── Radar chart — overall profile per model ──────────────────────────
import numpy as np
from math import pi

def minmax_norm(s, invert=False):
    lo, hi = s.min(), s.max()
    if hi == lo: return [0.5]*len(s)
    n = (s - lo)/(hi - lo)
    return (1-n).tolist() if invert else n.tolist()

axes_dict = {
    'Throughput':  minmax_norm(df['throughput_docs_s']),
    'Low Latency': minmax_norm(df['latency_ms_mean'], invert=True),
    'Recall@5':    minmax_norm(df['recall@5']),
    'MRR':         minmax_norm(df['mrr']),
    'Cosine Qual': minmax_norm(df['avg_top_cosine']),
    'Low Memory':  minmax_norm(df['memory_mb'], invert=True),
}
cats = list(axes_dict.keys())
num_vars = len(cats)

# Compute angle for each axis
angles = [n / float(num_vars) * 2 * pi for n in range(num_vars)]
angles += angles[:1]

# Create polar subplot
fig, ax = plt.subplots(figsize=(12, 10), subplot_kw=dict(projection='polar'))
fig.patch.set_facecolor('#0d1117')
ax.set_facecolor('#161b22')

colors = ['#58a6ff', '#3fb950', '#a371f7', '#f0883e', '#79c0ff'][:len(model_names)]

for i, mn in enumerate(model_names):
    vals = [axes_dict[a][i] for a in cats]
    vals += vals[:1]
    ax.plot(angles, vals, 'o-', linewidth=2.5, label=mn, color=colors[i])
    ax.fill(angles, vals, alpha=0.25, color=colors[i])

ax.set_xticks(angles[:-1])
ax.set_xticklabels(cats, size=11, color='#e6edf3')
ax.set_ylim(0, 1)
ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'], size=8, color='#a0aec0')
ax.grid(True, color='#30363d', alpha=0.3)

plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=10, framealpha=0.9)
plt.title('Overall Model Profile\nAll metrics normalised 0–1 (higher = better on every axis)',
          fontsize=14, fontweight='bold', color='#e6edf3', pad=20)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [13]:
# ── Latency vs Recall@5 bubble chart — the sweet-spot view ───────────
fig, ax = plt.subplots(figsize=(12, 7))
fig.patch.set_facecolor('#0d1117')
ax.set_facecolor('#161b22')

colors = ['#58a6ff', '#3fb950', '#a371f7', '#f0883e', '#79c0ff'][:len(model_names)]
sizes = (df['throughput_docs_s'] / df['throughput_docs_s'].max()) * 1000  # Scale bubble sizes

scatter = ax.scatter(df['latency_ms_mean'], df['recall@5'], s=sizes, c=colors, alpha=0.6, 
                     edgecolors='white', linewidth=2)

# Add model name labels
for idx, row in df.iterrows():
    ax.annotate(row['model'], 
                xy=(row['latency_ms_mean'], row['recall@5']),
                xytext=(5, 5), textcoords='offset points', 
                fontsize=10, color='#e6edf3', fontweight='bold')

ax.set_xlabel('Query Latency mean (ms) ↓', fontsize=12, fontweight='bold', color='#e6edf3')
ax.set_ylabel('Recall@5 ↑', fontsize=12, fontweight='bold', color='#e6edf3')
ax.set_title('Latency vs Recall@5 (bubble size = throughput docs/sec)\nIdeal: bottom-right — low latency, high recall',
             fontsize=13, fontweight='bold', color='#e6edf3', pad=15)

ax.grid(True, alpha=0.2, color='#30363d')
ax.tick_params(colors='#e6edf3', labelsize=10)

# Add legend for bubble sizes
legend_sizes = [df['throughput_docs_s'].min(), df['throughput_docs_s'].median(), df['throughput_docs_s'].max()]
legend_bubbles = []
for size in legend_sizes:
    legend_bubbles.append(ax.scatter([], [], s=(size/df['throughput_docs_s'].max())*1000, 
                                     c='#e6edf3', alpha=0.6, edgecolors='white', linewidth=1.5))
ax.legend(legend_bubbles, [f'{s:.0f} docs/s' for s in legend_sizes], 
          scatterpoints=1, frameon=True, labelspacing=2, title='Throughput', 
          loc='lower left', fontsize=9, title_fontsize=10, framealpha=0.9)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [20]:
# ── Summary table + winners ──────────────────────────────────────────
summary = df[['model','embedding_dim','throughput_docs_s','latency_ms_mean',
              'latency_ms_p95','recall@1','recall@3','recall@5','mrr',
              'avg_top_cosine','memory_mb','load_time_s']].copy()
summary.columns = ['Model','Dim','Thru\n(docs/s)','Latency\nMean(ms)','Latency\np95(ms)',
                   'R@1','R@3','R@5','MRR','AvgCos','Mem\n(MB)','Load\n(s)']

# Create figure for table
fig, ax = plt.subplots(figsize=(16, 6))
fig.patch.set_facecolor('#0d1117')
ax.axis('tight')
ax.axis('off')

# Convert to list format for table
table_data = [summary.columns.tolist()]
for idx, row in summary.iterrows():
    table_data.append(row.tolist())

# Create table
table = ax.table(cellText=table_data, cellLoc='center', loc='center',
                colWidths=[0.10, 0.08, 0.10, 0.10, 0.10, 0.08, 0.08, 0.08, 0.08, 0.10, 0.08, 0.08])

# Style the table
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 2.5)

# Color header row
for i in range(len(summary.columns)):
    cell = table[(0, i)]
    cell.set_facecolor('#3fb950')
    cell.set_text_props(weight='bold', color='#0d1117')

# Color data rows alternately
colors_alt = ['#161b22', '#1c2128']
for i in range(1, len(table_data)):
    for j in range(len(summary.columns)):
        cell = table[(i, j)]
        cell.set_facecolor(colors_alt[i % 2])
        cell.set_text_props(color='#e6edf3')
        cell.set_edgecolor('#30363d')

plt.title('EMBEDDING MODEL COMPARISON — FINAL SUMMARY', 
          fontsize=14, fontweight='bold', color='#e6edf3', pad=20)
plt.tight_layout()
plt.show()

# Create Winners table
winners_data = [['Metric', 'Winner Model', 'Score']]
for metric, col, better in [
    ('🚀 Throughput','throughput_docs_s','max'),
    ('⚡ Latency','latency_ms_mean','min'),
    ('🎯 Recall@5','recall@5','max'),
    ('📊 MRR','mrr','max'),
    ('✨ Cosine Quality','avg_top_cosine','max'),
    ('💾 Low Memory','memory_mb','min'),
]:
    idx = df[col].idxmax() if better=='max' else df[col].idxmin()
    value = df.loc[idx, col]
    model = df.loc[idx, 'model']
    winners_data.append([metric, model, f'{value}'])

# Create figure for winners table
fig, ax = plt.subplots(figsize=(12, 5))
fig.patch.set_facecolor('#0d1117')
ax.axis('tight')
ax.axis('off')

# Create winners table
winners_table = ax.table(cellText=winners_data, cellLoc='center', loc='center',
                        colWidths=[0.35, 0.35, 0.30])

# Style winners table
winners_table.auto_set_font_size(False)
winners_table.set_fontsize(11)
winners_table.scale(1, 2.8)

# Color header row
for i in range(3):
    cell = winners_table[(0, i)]
    cell.set_facecolor('#a371f7')
    cell.set_text_props(weight='bold', color='#0d1117', size=12)

# Color data rows with gradient effect
colors_winners = ['#161b22', '#1c2128']
for i in range(1, len(winners_data)):
    for j in range(3):
        cell = winners_table[(i, j)]
        cell.set_facecolor(colors_winners[i % 2])
        cell.set_text_props(color='#e6edf3', weight='bold' if j == 1 else 'normal')
        cell.set_edgecolor('#30363d')

plt.title('WINNERS BY METRIC', 
          fontsize=14, fontweight='bold', color='#e6edf3', pad=20)
plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image