benchmarks: add v3.3.0 reproduction results + 50/450 split

Addresses #875: every internal BENCHMARKS.md claim reproduced
on Linux x86_64 (v3.3.0 tag, deterministic ChromaDB embeddings,
seed=42 for the LongMemEval dev/held-out split).

Scorecard — all reproduce exactly, with the caveat noted for the rows marked * (see note below):

  LongMemEval
    raw R@5                            96.6% (483/500)   
    hybrid_v4 held-out 450 R@5         98.4% (442/450)   
    hybrid_v4 + minimax rerank R@5     99.2% (496/500)   *
    hybrid_v4 + minimax rerank R@10   100.0% (500/500)   *

  LoCoMo (session, top-10)
    raw                                60.3% (1986q)     
    hybrid v5                          88.9% (1986q)     

  ConvoMem all-categories (250 items)   92.9%            
  MemBench all-categories (8500)        80.3%            

* The minimax-m2.7:cloud rerank run replicates the "100%" claim
  with a different LLM family (no Anthropic dependency). R@10 is
  a perfect reproduction; R@5 misses 4 questions that the
  published Haiku run caught — consistent with BENCHMARKS.md's own
  disclosure that hybrid_v4 includes three question-specific fixes
  developed by inspecting misses, i.e. teaching to the test.

The committed 50/450 split is the deterministic (seed=42) split
that BENCHMARKS.md references but that was not previously in the
repo.

Full result JSONLs include every question, every retrieved id,
and every score — auditable end-to-end.
This commit is contained in:
Igor Lins e Silva
2026-04-14 21:21:11 -03:00
parent ca0682abe3
commit 61d02e10fe
9 changed files with 331251 additions and 0 deletions
+508
View File
@@ -0,0 +1,508 @@
{
"dev": [
"cc06de0d",
"f9e8c073",
"b320f3f8",
"a89d7624",
"311778f1",
"gpt4_59c863d7",
"bbf86515",
"099778bb",
"e831120c",
"dcfa8644",
"8fb83627",
"e66b632c",
"gpt4_7fce9456",
"55241a1f",
"352ab8bd",
"f4f1d8a4",
"830ce83f",
"2311e44b",
"09ba9854",
"gpt4_a1b77f9c",
"07741c45",
"gpt4_70e84552",
"b46e15ee",
"6071bd76",
"6f9b354f",
"1d4da289",
"gpt4_8279ba02",
"6456829e_abs",
"0db4c65d",
"d6062bb9",
"60bf93ed_abs",
"d3ab962e",
"87f22b4a",
"e01b8e2f",
"gpt4_7ddcf75f",
"8ebdbe50",
"26bdc477",
"29f2956b_abs",
"2311e44b_abs",
"75f70248",
"852ce960",
"f0e564bc",
"fca70973",
"3c1045c8",
"18bc8abd",
"afdc33df",
"54026fce",
"b9cfe692",
"6456829e",
"e6041065"
],
"held_out": [
"gpt4_15e38248",
"gpt4_2ba83207",
"2133c1b5_abs",
"gpt4_8279ba03",
"76d63226",
"1192316e",
"gpt4_fa19884d",
"gpt4_372c3eed_abs",
"1a8a66a6",
"gpt4_fe651585",
"e25c3b8d",
"945e3d21",
"86b68151",
"1c0ddc50",
"1e043500",
"d682f1a2",
"gpt4_b5700ca0",
"91b15a6e",
"ce6d2d27",
"f523d9fe",
"7024f17c",
"8752c811",
"gpt4_f420262d",
"d01c6aa8",
"4b24c848",
"7e974930",
"3fdac837",
"gpt4_b4a80587",
"c18a7dc8",
"80ec1f4f_abs",
"7527f7e2",
"6ade9755",
"89941a94",
"gpt4_1d80365e",
"2133c1b5",
"06db6396",
"gpt4_88806d6e",
"88432d0a",
"3ba21379",
"0862e8bf",
"aae3761f",
"5025383b",
"gpt4_e061b84f",
"73d42213",
"4bc144e2",
"gpt4_5501fe77",
"00ca467f",
"dfde3500",
"01493427",
"b6025781",
"a96c20ee_abs",
"982b5123_abs",
"gpt4_fa19884c",
"gpt4_1a1dc16d",
"28dc39ac",
"gpt4_2d58bcd6",
"51c32626",
"c4ea545c",
"1da05512",
"gpt4_385a5000",
"577d4d32",
"72e3ee87",
"f4f1d8a4_abs",
"9d25d4e0",
"b29f3365",
"b759caee",
"10e09553",
"1d4e3b97",
"d52b4f67",
"gpt4_e072b769",
"58ef2f1c",
"6e984301",
"41275add",
"gpt4_59149c77",
"2ebe6c90",
"1cea1afa",
"gpt4_1e4a8aec",
"6c49646a",
"8a2466db",
"gpt4_65aabe59",
"gpt4_93159ced",
"51a45a95",
"af8d2e46",
"561fabcd",
"370a8ff4",
"gpt4_d84a3211",
"gpt4_7a0daae1",
"2a1811e2",
"gpt4_78cf46a3",
"1568498a",
"6b7dfb22",
"6ae235be",
"bc8a6e93_abs",
"681a1674",
"06878be2",
"1a1907b4",
"0e4e4c46",
"gpt4_85da3956",
"gpt4_f420262c",
"2bf43736",
"bc149d6b",
"09d032c9",
"5c40ec5b",
"eac54adc",
"993da5e2",
"71a3fd6b",
"gpt4_0b2f1d21",
"ad7109d1",
"4c36ccef",
"c8c3f81d",
"edced276_abs",
"0bc8ad92",
"gpt4_468eb064",
"2ebe6c92",
"cc6d1ec1",
"4dfccbf8",
"95228167",
"ba358f49",
"45dc21b6",
"db467c8c",
"720133ac",
"67e0d0f2",
"cc5ded98",
"726462e0",
"4100d0a0",
"3a704032",
"gpt4_7ca326fa",
"ec81a493",
"618f13b2",
"58470ed2",
"gpt4_4fc4f797",
"60036106",
"157a136e",
"6222b6eb",
"69fee5aa",
"19b5f2b3_abs",
"gpt4_d12ceb0e",
"51b23612",
"2318644b",
"3fe836c9",
"gpt4_7de946e7",
"71017277",
"f0853d11",
"dc439ea3",
"gpt4_2f91af09",
"9a707b81",
"bc8a6e93",
"c14c00dd",
"8979f9ec",
"cf22b7bf",
"gpt4_ec93e27f",
"gpt4_468eb063",
"41698283",
"1de5cff2",
"21d02d0d",
"c7cf7dfd",
"gpt4_ab202e7f",
"dccbc061",
"078150f1",
"e3038f8c",
"gpt4_c27434e8_abs",
"2698e78f",
"031748ae_abs",
"gpt4_59149c78",
"c8f1aeed",
"184da446",
"gpt4_b5700ca9",
"89527b6b",
"0977f2af",
"853b0a1d",
"a346bb18",
"3249768e",
"gpt4_2f8be40d",
"gpt4_93159ced_abs",
"eeda8a6d",
"7a8d0b71",
"95bcc1c8",
"gpt4_2487a7cb",
"85fa3a3f",
"7e00a6cb",
"e3fc4d6e",
"59524333",
"37f165cf",
"0ddfec37",
"60bf93ed",
"d7c942c3",
"80ec1f4f",
"ceb54acb",
"9aaed6a3",
"gpt4_4929293a",
"ed4ddc30",
"545bd2b5",
"2788b940",
"ef9cf60a",
"gpt4_7f6b06db",
"0ea62687",
"3d86fd0a",
"3e321797",
"d24813b1",
"38146c39",
"efc3f7c2",
"7401057b",
"5809eb10",
"28bcfaac",
"1903aded",
"gpt4_194be4b3",
"gpt4_e414231f",
"0ddfec37_abs",
"c2ac3c61",
"gpt4_4ef30696",
"1f2b8d4f",
"0f05491a",
"8550ddae",
"8077ef71",
"b86304ba",
"e61a7584",
"8cf51dda",
"gpt4_2f584639",
"08e075c7",
"5d3d2817",
"7405e8b1",
"a3045048",
"gpt4_731e37d7",
"c8090214_abs",
"36580ce8",
"ba358f49_abs",
"gpt4_d6585ce8",
"e56a43b9",
"2c63a862",
"gpt4_5438fa52",
"07b6f563",
"gpt4_31ff4165",
"0bb5a684",
"71315a70",
"gpt4_cd90e484",
"gpt4_8c8961ae",
"gpt4_fe651585_abs",
"36b9f61e",
"gpt4_b0863698",
"gpt4_1d4ab0c9",
"15745da0_abs",
"0862e8bf_abs",
"bcbe585f",
"a2f3aa27",
"gpt4_6dc9b45b",
"ccb36322",
"f685340e",
"9ea5eabc",
"gpt4_372c3eed",
"37d43f65",
"bf659f65",
"b0479f84",
"gpt4_213fd887",
"e4e14d04",
"f8c5f88b",
"gpt4_18c2b244",
"a11281a2",
"gpt4_2655b836",
"e47becba",
"gpt4_74aed68e",
"gpt4_af6db32f",
"6cb6f249",
"77eafa52",
"gpt4_93f6379c",
"e8a79c70",
"7a87bd0c",
"gpt4_6ed717ea",
"d6233ab6",
"c19f7a0b",
"gpt4_61e13b3c",
"d23cf73b",
"gpt4_1e4a8aeb",
"ba61f0b9",
"118b2229",
"488d3006",
"c4a1ceb8",
"8e91e7d9",
"42ec0761",
"65240037",
"fea54f57",
"c8090214",
"b01defab",
"6aeb4375_abs",
"faba32e5",
"c5e8278d",
"gpt4_e414231e",
"eeda8a6d_abs",
"gpt4_8e165409",
"af082822",
"22d2cb42",
"92a0aa75",
"1c549ce4",
"25e5aa4f",
"gpt4_68e94288",
"4baee567",
"18dcd5a5",
"dad224aa",
"gpt4_f2262a51",
"29f2956b",
"21436231",
"19b5f2b3",
"gpt4_1916e0ea",
"gpt4_45189cb4",
"0a995998",
"b6019101",
"9bbe84a2",
"61f8c8f8",
"9a707b82",
"8cf4d046",
"eac54add",
"75832dbd",
"gpt4_98f46fc6",
"d596882b",
"88432d0a_abs",
"16c90bf4",
"f685340e_abs",
"b5ef892d",
"gpt4_f49edff3",
"gpt4_483dd43c",
"bb7c3b45",
"gpt4_7abb270c",
"gpt4_9a159967",
"07741c44",
"4d6b87c8",
"6aeb4375",
"gpt4_d6585ce9",
"60472f9c",
"caf9ead2",
"32260d93",
"60159905",
"0a34ad58",
"a40e080f",
"10d9b85a",
"a06e4cfe",
"4f54b7c9",
"6613b389",
"70b3e69b",
"gpt4_7bc6cf22",
"gpt4_0a05b494",
"778164c6",
"195a1a1b",
"8464fc84",
"b46e15ed",
"603deb26",
"eaca4986",
"2698e78f_abs",
"gpt4_21adecb5",
"2e6d26dc",
"5831f84d",
"08f4fc43",
"3f1e9474",
"c9f37c46",
"gpt4_2f56ae70",
"1b9b7252",
"35a27287",
"gpt4_d31cdae3",
"129d1232",
"4adc0475",
"27016adc",
"46a3abf7",
"9ee3ecd6",
"982b5123",
"09ba9854_abs",
"0e5e2d1a",
"e9327a54",
"86f00804",
"e982271f",
"7161e7e2",
"57f827a0",
"6a27ffc2",
"edced276",
"gpt4_d9af6064",
"75499fd8",
"60d45044",
"gpt4_70e84552_abs",
"2ce6a0f2",
"gpt4_4929293b",
"a1cc6108",
"gpt4_5dcc0aab",
"a3838d2b",
"c7dc5443",
"505af2f5",
"gpt4_68e94287",
"15745da0",
"0100672e",
"a82c026e",
"5e1b23de",
"71017276",
"89941a93",
"6b168ec8",
"affe2881",
"0edc2aef",
"gpt4_2312f94c",
"a4996e51",
"c6853660",
"ef66a6e5",
"8a137a7f",
"a96c20ee",
"fca762bc",
"ac031881",
"d905b33f",
"e493bb7c",
"a9f6b44c",
"dd2973ad",
"8aef76bc",
"f35224e0",
"8b9d4367",
"gpt4_c27434e8",
"gpt4_a56e767c",
"eace081b",
"5a4f22c0",
"58bf7951",
"c4f10528",
"50635ada",
"06f04340",
"0bc8ad93",
"e5ba910e_abs",
"5a7937c8",
"a3332713",
"4388e9dd",
"8c18457d",
"gpt4_2c50253f",
"6a1eabeb",
"b3c15d39",
"gpt4_e061b84g",
"3b6f954b",
"gpt4_76048e76",
"4dfccbf7",
"2b8f3739",
"d851d5ba",
"4fd1909e",
"94f70d80",
"66f24dbb",
"a08a253f",
"6e984302",
"001be529",
"gpt4_a2d1d1f6",
"cc539528",
"e48988bc",
"gpt4_4cd9eba1",
"8e9d538c",
"a1eacc2a",
"6d550036",
"gpt4_e05b82a6",
"81507db6",
"caf03d32",
"031748ae",
"c960da58",
"1faac195",
"gpt4_4edbafa2"
],
"seed": 42,
"dev_size": 50
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long