Merge pull request #895 from MemPalace/bench/v3.3.0-verification
benchmarks: v3.3.0 reproduction results + Ollama rerank backend
This commit is contained in:
@@ -0,0 +1,508 @@
|
|||||||
|
{
|
||||||
|
"dev": [
|
||||||
|
"cc06de0d",
|
||||||
|
"f9e8c073",
|
||||||
|
"b320f3f8",
|
||||||
|
"a89d7624",
|
||||||
|
"311778f1",
|
||||||
|
"gpt4_59c863d7",
|
||||||
|
"bbf86515",
|
||||||
|
"099778bb",
|
||||||
|
"e831120c",
|
||||||
|
"dcfa8644",
|
||||||
|
"8fb83627",
|
||||||
|
"e66b632c",
|
||||||
|
"gpt4_7fce9456",
|
||||||
|
"55241a1f",
|
||||||
|
"352ab8bd",
|
||||||
|
"f4f1d8a4",
|
||||||
|
"830ce83f",
|
||||||
|
"2311e44b",
|
||||||
|
"09ba9854",
|
||||||
|
"gpt4_a1b77f9c",
|
||||||
|
"07741c45",
|
||||||
|
"gpt4_70e84552",
|
||||||
|
"b46e15ee",
|
||||||
|
"6071bd76",
|
||||||
|
"6f9b354f",
|
||||||
|
"1d4da289",
|
||||||
|
"gpt4_8279ba02",
|
||||||
|
"6456829e_abs",
|
||||||
|
"0db4c65d",
|
||||||
|
"d6062bb9",
|
||||||
|
"60bf93ed_abs",
|
||||||
|
"d3ab962e",
|
||||||
|
"87f22b4a",
|
||||||
|
"e01b8e2f",
|
||||||
|
"gpt4_7ddcf75f",
|
||||||
|
"8ebdbe50",
|
||||||
|
"26bdc477",
|
||||||
|
"29f2956b_abs",
|
||||||
|
"2311e44b_abs",
|
||||||
|
"75f70248",
|
||||||
|
"852ce960",
|
||||||
|
"f0e564bc",
|
||||||
|
"fca70973",
|
||||||
|
"3c1045c8",
|
||||||
|
"18bc8abd",
|
||||||
|
"afdc33df",
|
||||||
|
"54026fce",
|
||||||
|
"b9cfe692",
|
||||||
|
"6456829e",
|
||||||
|
"e6041065"
|
||||||
|
],
|
||||||
|
"held_out": [
|
||||||
|
"gpt4_15e38248",
|
||||||
|
"gpt4_2ba83207",
|
||||||
|
"2133c1b5_abs",
|
||||||
|
"gpt4_8279ba03",
|
||||||
|
"76d63226",
|
||||||
|
"1192316e",
|
||||||
|
"gpt4_fa19884d",
|
||||||
|
"gpt4_372c3eed_abs",
|
||||||
|
"1a8a66a6",
|
||||||
|
"gpt4_fe651585",
|
||||||
|
"e25c3b8d",
|
||||||
|
"945e3d21",
|
||||||
|
"86b68151",
|
||||||
|
"1c0ddc50",
|
||||||
|
"1e043500",
|
||||||
|
"d682f1a2",
|
||||||
|
"gpt4_b5700ca0",
|
||||||
|
"91b15a6e",
|
||||||
|
"ce6d2d27",
|
||||||
|
"f523d9fe",
|
||||||
|
"7024f17c",
|
||||||
|
"8752c811",
|
||||||
|
"gpt4_f420262d",
|
||||||
|
"d01c6aa8",
|
||||||
|
"4b24c848",
|
||||||
|
"7e974930",
|
||||||
|
"3fdac837",
|
||||||
|
"gpt4_b4a80587",
|
||||||
|
"c18a7dc8",
|
||||||
|
"80ec1f4f_abs",
|
||||||
|
"7527f7e2",
|
||||||
|
"6ade9755",
|
||||||
|
"89941a94",
|
||||||
|
"gpt4_1d80365e",
|
||||||
|
"2133c1b5",
|
||||||
|
"06db6396",
|
||||||
|
"gpt4_88806d6e",
|
||||||
|
"88432d0a",
|
||||||
|
"3ba21379",
|
||||||
|
"0862e8bf",
|
||||||
|
"aae3761f",
|
||||||
|
"5025383b",
|
||||||
|
"gpt4_e061b84f",
|
||||||
|
"73d42213",
|
||||||
|
"4bc144e2",
|
||||||
|
"gpt4_5501fe77",
|
||||||
|
"00ca467f",
|
||||||
|
"dfde3500",
|
||||||
|
"01493427",
|
||||||
|
"b6025781",
|
||||||
|
"a96c20ee_abs",
|
||||||
|
"982b5123_abs",
|
||||||
|
"gpt4_fa19884c",
|
||||||
|
"gpt4_1a1dc16d",
|
||||||
|
"28dc39ac",
|
||||||
|
"gpt4_2d58bcd6",
|
||||||
|
"51c32626",
|
||||||
|
"c4ea545c",
|
||||||
|
"1da05512",
|
||||||
|
"gpt4_385a5000",
|
||||||
|
"577d4d32",
|
||||||
|
"72e3ee87",
|
||||||
|
"f4f1d8a4_abs",
|
||||||
|
"9d25d4e0",
|
||||||
|
"b29f3365",
|
||||||
|
"b759caee",
|
||||||
|
"10e09553",
|
||||||
|
"1d4e3b97",
|
||||||
|
"d52b4f67",
|
||||||
|
"gpt4_e072b769",
|
||||||
|
"58ef2f1c",
|
||||||
|
"6e984301",
|
||||||
|
"41275add",
|
||||||
|
"gpt4_59149c77",
|
||||||
|
"2ebe6c90",
|
||||||
|
"1cea1afa",
|
||||||
|
"gpt4_1e4a8aec",
|
||||||
|
"6c49646a",
|
||||||
|
"8a2466db",
|
||||||
|
"gpt4_65aabe59",
|
||||||
|
"gpt4_93159ced",
|
||||||
|
"51a45a95",
|
||||||
|
"af8d2e46",
|
||||||
|
"561fabcd",
|
||||||
|
"370a8ff4",
|
||||||
|
"gpt4_d84a3211",
|
||||||
|
"gpt4_7a0daae1",
|
||||||
|
"2a1811e2",
|
||||||
|
"gpt4_78cf46a3",
|
||||||
|
"1568498a",
|
||||||
|
"6b7dfb22",
|
||||||
|
"6ae235be",
|
||||||
|
"bc8a6e93_abs",
|
||||||
|
"681a1674",
|
||||||
|
"06878be2",
|
||||||
|
"1a1907b4",
|
||||||
|
"0e4e4c46",
|
||||||
|
"gpt4_85da3956",
|
||||||
|
"gpt4_f420262c",
|
||||||
|
"2bf43736",
|
||||||
|
"bc149d6b",
|
||||||
|
"09d032c9",
|
||||||
|
"5c40ec5b",
|
||||||
|
"eac54adc",
|
||||||
|
"993da5e2",
|
||||||
|
"71a3fd6b",
|
||||||
|
"gpt4_0b2f1d21",
|
||||||
|
"ad7109d1",
|
||||||
|
"4c36ccef",
|
||||||
|
"c8c3f81d",
|
||||||
|
"edced276_abs",
|
||||||
|
"0bc8ad92",
|
||||||
|
"gpt4_468eb064",
|
||||||
|
"2ebe6c92",
|
||||||
|
"cc6d1ec1",
|
||||||
|
"4dfccbf8",
|
||||||
|
"95228167",
|
||||||
|
"ba358f49",
|
||||||
|
"45dc21b6",
|
||||||
|
"db467c8c",
|
||||||
|
"720133ac",
|
||||||
|
"67e0d0f2",
|
||||||
|
"cc5ded98",
|
||||||
|
"726462e0",
|
||||||
|
"4100d0a0",
|
||||||
|
"3a704032",
|
||||||
|
"gpt4_7ca326fa",
|
||||||
|
"ec81a493",
|
||||||
|
"618f13b2",
|
||||||
|
"58470ed2",
|
||||||
|
"gpt4_4fc4f797",
|
||||||
|
"60036106",
|
||||||
|
"157a136e",
|
||||||
|
"6222b6eb",
|
||||||
|
"69fee5aa",
|
||||||
|
"19b5f2b3_abs",
|
||||||
|
"gpt4_d12ceb0e",
|
||||||
|
"51b23612",
|
||||||
|
"2318644b",
|
||||||
|
"3fe836c9",
|
||||||
|
"gpt4_7de946e7",
|
||||||
|
"71017277",
|
||||||
|
"f0853d11",
|
||||||
|
"dc439ea3",
|
||||||
|
"gpt4_2f91af09",
|
||||||
|
"9a707b81",
|
||||||
|
"bc8a6e93",
|
||||||
|
"c14c00dd",
|
||||||
|
"8979f9ec",
|
||||||
|
"cf22b7bf",
|
||||||
|
"gpt4_ec93e27f",
|
||||||
|
"gpt4_468eb063",
|
||||||
|
"41698283",
|
||||||
|
"1de5cff2",
|
||||||
|
"21d02d0d",
|
||||||
|
"c7cf7dfd",
|
||||||
|
"gpt4_ab202e7f",
|
||||||
|
"dccbc061",
|
||||||
|
"078150f1",
|
||||||
|
"e3038f8c",
|
||||||
|
"gpt4_c27434e8_abs",
|
||||||
|
"2698e78f",
|
||||||
|
"031748ae_abs",
|
||||||
|
"gpt4_59149c78",
|
||||||
|
"c8f1aeed",
|
||||||
|
"184da446",
|
||||||
|
"gpt4_b5700ca9",
|
||||||
|
"89527b6b",
|
||||||
|
"0977f2af",
|
||||||
|
"853b0a1d",
|
||||||
|
"a346bb18",
|
||||||
|
"3249768e",
|
||||||
|
"gpt4_2f8be40d",
|
||||||
|
"gpt4_93159ced_abs",
|
||||||
|
"eeda8a6d",
|
||||||
|
"7a8d0b71",
|
||||||
|
"95bcc1c8",
|
||||||
|
"gpt4_2487a7cb",
|
||||||
|
"85fa3a3f",
|
||||||
|
"7e00a6cb",
|
||||||
|
"e3fc4d6e",
|
||||||
|
"59524333",
|
||||||
|
"37f165cf",
|
||||||
|
"0ddfec37",
|
||||||
|
"60bf93ed",
|
||||||
|
"d7c942c3",
|
||||||
|
"80ec1f4f",
|
||||||
|
"ceb54acb",
|
||||||
|
"9aaed6a3",
|
||||||
|
"gpt4_4929293a",
|
||||||
|
"ed4ddc30",
|
||||||
|
"545bd2b5",
|
||||||
|
"2788b940",
|
||||||
|
"ef9cf60a",
|
||||||
|
"gpt4_7f6b06db",
|
||||||
|
"0ea62687",
|
||||||
|
"3d86fd0a",
|
||||||
|
"3e321797",
|
||||||
|
"d24813b1",
|
||||||
|
"38146c39",
|
||||||
|
"efc3f7c2",
|
||||||
|
"7401057b",
|
||||||
|
"5809eb10",
|
||||||
|
"28bcfaac",
|
||||||
|
"1903aded",
|
||||||
|
"gpt4_194be4b3",
|
||||||
|
"gpt4_e414231f",
|
||||||
|
"0ddfec37_abs",
|
||||||
|
"c2ac3c61",
|
||||||
|
"gpt4_4ef30696",
|
||||||
|
"1f2b8d4f",
|
||||||
|
"0f05491a",
|
||||||
|
"8550ddae",
|
||||||
|
"8077ef71",
|
||||||
|
"b86304ba",
|
||||||
|
"e61a7584",
|
||||||
|
"8cf51dda",
|
||||||
|
"gpt4_2f584639",
|
||||||
|
"08e075c7",
|
||||||
|
"5d3d2817",
|
||||||
|
"7405e8b1",
|
||||||
|
"a3045048",
|
||||||
|
"gpt4_731e37d7",
|
||||||
|
"c8090214_abs",
|
||||||
|
"36580ce8",
|
||||||
|
"ba358f49_abs",
|
||||||
|
"gpt4_d6585ce8",
|
||||||
|
"e56a43b9",
|
||||||
|
"2c63a862",
|
||||||
|
"gpt4_5438fa52",
|
||||||
|
"07b6f563",
|
||||||
|
"gpt4_31ff4165",
|
||||||
|
"0bb5a684",
|
||||||
|
"71315a70",
|
||||||
|
"gpt4_cd90e484",
|
||||||
|
"gpt4_8c8961ae",
|
||||||
|
"gpt4_fe651585_abs",
|
||||||
|
"36b9f61e",
|
||||||
|
"gpt4_b0863698",
|
||||||
|
"gpt4_1d4ab0c9",
|
||||||
|
"15745da0_abs",
|
||||||
|
"0862e8bf_abs",
|
||||||
|
"bcbe585f",
|
||||||
|
"a2f3aa27",
|
||||||
|
"gpt4_6dc9b45b",
|
||||||
|
"ccb36322",
|
||||||
|
"f685340e",
|
||||||
|
"9ea5eabc",
|
||||||
|
"gpt4_372c3eed",
|
||||||
|
"37d43f65",
|
||||||
|
"bf659f65",
|
||||||
|
"b0479f84",
|
||||||
|
"gpt4_213fd887",
|
||||||
|
"e4e14d04",
|
||||||
|
"f8c5f88b",
|
||||||
|
"gpt4_18c2b244",
|
||||||
|
"a11281a2",
|
||||||
|
"gpt4_2655b836",
|
||||||
|
"e47becba",
|
||||||
|
"gpt4_74aed68e",
|
||||||
|
"gpt4_af6db32f",
|
||||||
|
"6cb6f249",
|
||||||
|
"77eafa52",
|
||||||
|
"gpt4_93f6379c",
|
||||||
|
"e8a79c70",
|
||||||
|
"7a87bd0c",
|
||||||
|
"gpt4_6ed717ea",
|
||||||
|
"d6233ab6",
|
||||||
|
"c19f7a0b",
|
||||||
|
"gpt4_61e13b3c",
|
||||||
|
"d23cf73b",
|
||||||
|
"gpt4_1e4a8aeb",
|
||||||
|
"ba61f0b9",
|
||||||
|
"118b2229",
|
||||||
|
"488d3006",
|
||||||
|
"c4a1ceb8",
|
||||||
|
"8e91e7d9",
|
||||||
|
"42ec0761",
|
||||||
|
"65240037",
|
||||||
|
"fea54f57",
|
||||||
|
"c8090214",
|
||||||
|
"b01defab",
|
||||||
|
"6aeb4375_abs",
|
||||||
|
"faba32e5",
|
||||||
|
"c5e8278d",
|
||||||
|
"gpt4_e414231e",
|
||||||
|
"eeda8a6d_abs",
|
||||||
|
"gpt4_8e165409",
|
||||||
|
"af082822",
|
||||||
|
"22d2cb42",
|
||||||
|
"92a0aa75",
|
||||||
|
"1c549ce4",
|
||||||
|
"25e5aa4f",
|
||||||
|
"gpt4_68e94288",
|
||||||
|
"4baee567",
|
||||||
|
"18dcd5a5",
|
||||||
|
"dad224aa",
|
||||||
|
"gpt4_f2262a51",
|
||||||
|
"29f2956b",
|
||||||
|
"21436231",
|
||||||
|
"19b5f2b3",
|
||||||
|
"gpt4_1916e0ea",
|
||||||
|
"gpt4_45189cb4",
|
||||||
|
"0a995998",
|
||||||
|
"b6019101",
|
||||||
|
"9bbe84a2",
|
||||||
|
"61f8c8f8",
|
||||||
|
"9a707b82",
|
||||||
|
"8cf4d046",
|
||||||
|
"eac54add",
|
||||||
|
"75832dbd",
|
||||||
|
"gpt4_98f46fc6",
|
||||||
|
"d596882b",
|
||||||
|
"88432d0a_abs",
|
||||||
|
"16c90bf4",
|
||||||
|
"f685340e_abs",
|
||||||
|
"b5ef892d",
|
||||||
|
"gpt4_f49edff3",
|
||||||
|
"gpt4_483dd43c",
|
||||||
|
"bb7c3b45",
|
||||||
|
"gpt4_7abb270c",
|
||||||
|
"gpt4_9a159967",
|
||||||
|
"07741c44",
|
||||||
|
"4d6b87c8",
|
||||||
|
"6aeb4375",
|
||||||
|
"gpt4_d6585ce9",
|
||||||
|
"60472f9c",
|
||||||
|
"caf9ead2",
|
||||||
|
"32260d93",
|
||||||
|
"60159905",
|
||||||
|
"0a34ad58",
|
||||||
|
"a40e080f",
|
||||||
|
"10d9b85a",
|
||||||
|
"a06e4cfe",
|
||||||
|
"4f54b7c9",
|
||||||
|
"6613b389",
|
||||||
|
"70b3e69b",
|
||||||
|
"gpt4_7bc6cf22",
|
||||||
|
"gpt4_0a05b494",
|
||||||
|
"778164c6",
|
||||||
|
"195a1a1b",
|
||||||
|
"8464fc84",
|
||||||
|
"b46e15ed",
|
||||||
|
"603deb26",
|
||||||
|
"eaca4986",
|
||||||
|
"2698e78f_abs",
|
||||||
|
"gpt4_21adecb5",
|
||||||
|
"2e6d26dc",
|
||||||
|
"5831f84d",
|
||||||
|
"08f4fc43",
|
||||||
|
"3f1e9474",
|
||||||
|
"c9f37c46",
|
||||||
|
"gpt4_2f56ae70",
|
||||||
|
"1b9b7252",
|
||||||
|
"35a27287",
|
||||||
|
"gpt4_d31cdae3",
|
||||||
|
"129d1232",
|
||||||
|
"4adc0475",
|
||||||
|
"27016adc",
|
||||||
|
"46a3abf7",
|
||||||
|
"9ee3ecd6",
|
||||||
|
"982b5123",
|
||||||
|
"09ba9854_abs",
|
||||||
|
"0e5e2d1a",
|
||||||
|
"e9327a54",
|
||||||
|
"86f00804",
|
||||||
|
"e982271f",
|
||||||
|
"7161e7e2",
|
||||||
|
"57f827a0",
|
||||||
|
"6a27ffc2",
|
||||||
|
"edced276",
|
||||||
|
"gpt4_d9af6064",
|
||||||
|
"75499fd8",
|
||||||
|
"60d45044",
|
||||||
|
"gpt4_70e84552_abs",
|
||||||
|
"2ce6a0f2",
|
||||||
|
"gpt4_4929293b",
|
||||||
|
"a1cc6108",
|
||||||
|
"gpt4_5dcc0aab",
|
||||||
|
"a3838d2b",
|
||||||
|
"c7dc5443",
|
||||||
|
"505af2f5",
|
||||||
|
"gpt4_68e94287",
|
||||||
|
"15745da0",
|
||||||
|
"0100672e",
|
||||||
|
"a82c026e",
|
||||||
|
"5e1b23de",
|
||||||
|
"71017276",
|
||||||
|
"89941a93",
|
||||||
|
"6b168ec8",
|
||||||
|
"affe2881",
|
||||||
|
"0edc2aef",
|
||||||
|
"gpt4_2312f94c",
|
||||||
|
"a4996e51",
|
||||||
|
"c6853660",
|
||||||
|
"ef66a6e5",
|
||||||
|
"8a137a7f",
|
||||||
|
"a96c20ee",
|
||||||
|
"fca762bc",
|
||||||
|
"ac031881",
|
||||||
|
"d905b33f",
|
||||||
|
"e493bb7c",
|
||||||
|
"a9f6b44c",
|
||||||
|
"dd2973ad",
|
||||||
|
"8aef76bc",
|
||||||
|
"f35224e0",
|
||||||
|
"8b9d4367",
|
||||||
|
"gpt4_c27434e8",
|
||||||
|
"gpt4_a56e767c",
|
||||||
|
"eace081b",
|
||||||
|
"5a4f22c0",
|
||||||
|
"58bf7951",
|
||||||
|
"c4f10528",
|
||||||
|
"50635ada",
|
||||||
|
"06f04340",
|
||||||
|
"0bc8ad93",
|
||||||
|
"e5ba910e_abs",
|
||||||
|
"5a7937c8",
|
||||||
|
"a3332713",
|
||||||
|
"4388e9dd",
|
||||||
|
"8c18457d",
|
||||||
|
"gpt4_2c50253f",
|
||||||
|
"6a1eabeb",
|
||||||
|
"b3c15d39",
|
||||||
|
"gpt4_e061b84g",
|
||||||
|
"3b6f954b",
|
||||||
|
"gpt4_76048e76",
|
||||||
|
"4dfccbf7",
|
||||||
|
"2b8f3739",
|
||||||
|
"d851d5ba",
|
||||||
|
"4fd1909e",
|
||||||
|
"94f70d80",
|
||||||
|
"66f24dbb",
|
||||||
|
"a08a253f",
|
||||||
|
"6e984302",
|
||||||
|
"001be529",
|
||||||
|
"gpt4_a2d1d1f6",
|
||||||
|
"cc539528",
|
||||||
|
"e48988bc",
|
||||||
|
"gpt4_4cd9eba1",
|
||||||
|
"8e9d538c",
|
||||||
|
"a1eacc2a",
|
||||||
|
"6d550036",
|
||||||
|
"gpt4_e05b82a6",
|
||||||
|
"81507db6",
|
||||||
|
"caf03d32",
|
||||||
|
"031748ae",
|
||||||
|
"c960da58",
|
||||||
|
"1faac195",
|
||||||
|
"gpt4_4edbafa2"
|
||||||
|
],
|
||||||
|
"seed": 42,
|
||||||
|
"dev_size": 50
|
||||||
|
}
|
||||||
+60
-14
@@ -510,11 +510,20 @@ def palace_assign_rooms(sessions, sample_id, api_key, cache, model="claude-haiku
|
|||||||
|
|
||||||
|
|
||||||
def llm_rerank_locomo(
|
def llm_rerank_locomo(
|
||||||
question, retrieved_ids, retrieved_docs, api_key, top_k=10, model="claude-sonnet-4-6"
|
question,
|
||||||
|
retrieved_ids,
|
||||||
|
retrieved_docs,
|
||||||
|
api_key,
|
||||||
|
top_k=10,
|
||||||
|
model="claude-sonnet-4-6",
|
||||||
|
backend="anthropic",
|
||||||
|
base_url="",
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Ask LLM to pick the single most relevant document for this question.
|
Ask LLM to pick the single most relevant document for this question.
|
||||||
Returns reordered retrieved_ids with the best candidate first.
|
Returns reordered retrieved_ids with the best candidate first.
|
||||||
|
|
||||||
|
Supports backend="anthropic" (default) or "ollama" (OpenAI-compat endpoint).
|
||||||
"""
|
"""
|
||||||
candidates = retrieved_ids[:top_k]
|
candidates = retrieved_ids[:top_k]
|
||||||
candidate_docs = retrieved_docs[:top_k]
|
candidate_docs = retrieved_docs[:top_k]
|
||||||
@@ -522,7 +531,6 @@ def llm_rerank_locomo(
|
|||||||
if len(candidates) <= 1:
|
if len(candidates) <= 1:
|
||||||
return retrieved_ids
|
return retrieved_ids
|
||||||
|
|
||||||
# Build numbered list of candidates
|
|
||||||
lines = []
|
lines = []
|
||||||
for i, (cid, doc) in enumerate(zip(candidates, candidate_docs), 1):
|
for i, (cid, doc) in enumerate(zip(candidates, candidate_docs), 1):
|
||||||
snippet = doc[:300].replace("\n", " ")
|
snippet = doc[:300].replace("\n", " ")
|
||||||
@@ -534,6 +542,21 @@ def llm_rerank_locomo(
|
|||||||
f"Reply with just the number (1-{len(candidates)}).\n\n" + "\n".join(lines)
|
f"Reply with just the number (1-{len(candidates)}).\n\n" + "\n".join(lines)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if backend == "ollama":
|
||||||
|
url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
|
||||||
|
payload = json.dumps(
|
||||||
|
{
|
||||||
|
"model": model,
|
||||||
|
"messages": [{"role": "user", "content": prompt}],
|
||||||
|
"max_tokens": 1024,
|
||||||
|
"temperature": 0.0,
|
||||||
|
}
|
||||||
|
).encode("utf-8")
|
||||||
|
headers = {"content-type": "application/json"}
|
||||||
|
if api_key:
|
||||||
|
headers["authorization"] = f"Bearer {api_key}"
|
||||||
|
else:
|
||||||
|
url = "https://api.anthropic.com/v1/messages"
|
||||||
payload = json.dumps(
|
payload = json.dumps(
|
||||||
{
|
{
|
||||||
"model": model,
|
"model": model,
|
||||||
@@ -541,28 +564,29 @@ def llm_rerank_locomo(
|
|||||||
"messages": [{"role": "user", "content": prompt}],
|
"messages": [{"role": "user", "content": prompt}],
|
||||||
}
|
}
|
||||||
).encode("utf-8")
|
).encode("utf-8")
|
||||||
|
|
||||||
req = urllib.request.Request(
|
|
||||||
"https://api.anthropic.com/v1/messages",
|
|
||||||
data=payload,
|
|
||||||
headers = {
|
headers = {
|
||||||
"x-api-key": api_key,
|
"x-api-key": api_key,
|
||||||
"anthropic-version": "2023-06-01",
|
"anthropic-version": "2023-06-01",
|
||||||
"content-type": "application/json",
|
"content-type": "application/json",
|
||||||
},
|
}
|
||||||
method="POST",
|
|
||||||
)
|
req = urllib.request.Request(url, data=payload, headers=headers, method="POST")
|
||||||
|
|
||||||
import socket as _socket
|
import socket as _socket
|
||||||
|
|
||||||
for _attempt in range(3):
|
for _attempt in range(3):
|
||||||
try:
|
try:
|
||||||
with urllib.request.urlopen(req, timeout=30) as resp:
|
with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 30) as resp:
|
||||||
result = json.loads(resp.read())
|
result = json.loads(resp.read())
|
||||||
|
if backend == "ollama":
|
||||||
|
msg = result["choices"][0]["message"]
|
||||||
|
raw = (msg.get("content") or "").strip() or (msg.get("reasoning") or "").strip()
|
||||||
|
else:
|
||||||
raw = result["content"][0]["text"].strip()
|
raw = result["content"][0]["text"].strip()
|
||||||
m = re.search(r"\b(\d+)\b", raw)
|
# Take LAST integer — reasoning models often count candidates first
|
||||||
|
m = re.search(r"\b(\d+)\b", raw[::-1])
|
||||||
if m:
|
if m:
|
||||||
pick = int(m.group(1))
|
pick = int(m.group(1)[::-1])
|
||||||
if 1 <= pick <= len(candidates):
|
if 1 <= pick <= len(candidates):
|
||||||
chosen_id = candidates[pick - 1]
|
chosen_id = candidates[pick - 1]
|
||||||
reordered = [chosen_id] + [cid for cid in retrieved_ids if cid != chosen_id]
|
reordered = [chosen_id] + [cid for cid in retrieved_ids if cid != chosen_id]
|
||||||
@@ -608,6 +632,8 @@ def run_benchmark(
|
|||||||
palace_cache_file=None,
|
palace_cache_file=None,
|
||||||
palace_model="claude-haiku-4-5-20251001",
|
palace_model="claude-haiku-4-5-20251001",
|
||||||
embed_model="default",
|
embed_model="default",
|
||||||
|
llm_backend="anthropic",
|
||||||
|
llm_base_url="",
|
||||||
):
|
):
|
||||||
"""Run LoCoMo retrieval benchmark."""
|
"""Run LoCoMo retrieval benchmark."""
|
||||||
with open(data_file) as f:
|
with open(data_file) as f:
|
||||||
@@ -619,8 +645,12 @@ def run_benchmark(
|
|||||||
api_key = ""
|
api_key = ""
|
||||||
if llm_rerank_enabled or mode == "palace":
|
if llm_rerank_enabled or mode == "palace":
|
||||||
api_key = _load_api_key(llm_key)
|
api_key = _load_api_key(llm_key)
|
||||||
if not api_key:
|
# Ollama backend doesn't require an Anthropic key. Palace mode still does
|
||||||
print(f"ERROR: --mode {mode} requires an API key (--llm-key or ANTHROPIC_API_KEY).")
|
# (it uses Anthropic for room-assignment indexing) — so only relax the
|
||||||
|
# requirement when rerank is the ONLY llm use and backend is ollama.
|
||||||
|
needs_key = mode == "palace" or (llm_rerank_enabled and llm_backend == "anthropic")
|
||||||
|
if needs_key and not api_key:
|
||||||
|
print(f"ERROR: --mode {mode} / --llm-rerank (anthropic) requires an API key.")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# Palace mode: load or create room assignment cache
|
# Palace mode: load or create room assignment cache
|
||||||
@@ -888,6 +918,8 @@ def run_benchmark(
|
|||||||
api_key,
|
api_key,
|
||||||
top_k=rerank_pool,
|
top_k=rerank_pool,
|
||||||
model=llm_model,
|
model=llm_model,
|
||||||
|
backend=llm_backend,
|
||||||
|
base_url=llm_base_url,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Compute recall
|
# Compute recall
|
||||||
@@ -1013,6 +1045,18 @@ if __name__ == "__main__":
|
|||||||
help="Model for LLM rerank (default: claude-sonnet-4-6)",
|
help="Model for LLM rerank (default: claude-sonnet-4-6)",
|
||||||
)
|
)
|
||||||
parser.add_argument("--llm-key", default="", help="API key (or set ANTHROPIC_API_KEY env var)")
|
parser.add_argument("--llm-key", default="", help="API key (or set ANTHROPIC_API_KEY env var)")
|
||||||
|
parser.add_argument(
|
||||||
|
"--llm-backend",
|
||||||
|
choices=["anthropic", "ollama"],
|
||||||
|
default="anthropic",
|
||||||
|
help="Which API for --llm-rerank. 'anthropic' (default) or 'ollama' "
|
||||||
|
"(OpenAI-compat /v1/chat/completions — works for local + Ollama Cloud).",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--llm-base-url",
|
||||||
|
default="",
|
||||||
|
help="Override base URL for --llm-backend ollama. Default: http://localhost:11434.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--hybrid-weight",
|
"--hybrid-weight",
|
||||||
type=float,
|
type=float,
|
||||||
@@ -1049,4 +1093,6 @@ if __name__ == "__main__":
|
|||||||
palace_cache_file=args.palace_cache,
|
palace_cache_file=args.palace_cache,
|
||||||
palace_model=args.palace_model,
|
palace_model=args.palace_model,
|
||||||
embed_model=args.embed_model,
|
embed_model=args.embed_model,
|
||||||
|
llm_backend=args.llm_backend,
|
||||||
|
llm_base_url=args.llm_base_url,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -2763,7 +2763,15 @@ def build_palace_and_retrieve_diary(
|
|||||||
|
|
||||||
|
|
||||||
def llm_rerank(
|
def llm_rerank(
|
||||||
question, rankings, corpus, corpus_ids, api_key, top_k=10, model="claude-haiku-4-5-20251001"
|
question,
|
||||||
|
rankings,
|
||||||
|
corpus,
|
||||||
|
corpus_ids,
|
||||||
|
api_key,
|
||||||
|
top_k=10,
|
||||||
|
model="claude-haiku-4-5-20251001",
|
||||||
|
backend="anthropic",
|
||||||
|
base_url="",
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Use an LLM to re-rank the top-k retrieved sessions.
|
Use an LLM to re-rank the top-k retrieved sessions.
|
||||||
@@ -2772,19 +2780,22 @@ def llm_rerank(
|
|||||||
which single session is most relevant to the question. That session
|
which single session is most relevant to the question. That session
|
||||||
is promoted to rank 1; the rest stay in their existing order.
|
is promoted to rank 1; the rest stay in their existing order.
|
||||||
|
|
||||||
This closes the gap for "preference" and jargon-dense "assistant"
|
Supports two backends:
|
||||||
failures where the right session is in top-10 semantically but not
|
- "anthropic": hits https://api.anthropic.com/v1/messages with x-api-key.
|
||||||
top-5 — because the semantic gap (battery life ↔ phone hardware) is
|
- "ollama": hits {base_url}/v1/chat/completions (OpenAI-compat) —
|
||||||
too large for embeddings to bridge.
|
works for local Ollama (default http://localhost:11434)
|
||||||
|
and Ollama Cloud (:cloud model tags).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
question: The benchmark question string
|
question: The benchmark question string
|
||||||
rankings: Current ranked list of corpus indices (from any mode)
|
rankings: Current ranked list of corpus indices (from any mode)
|
||||||
corpus: List of document strings
|
corpus: List of document strings
|
||||||
corpus_ids: List of corpus IDs (parallel to corpus)
|
corpus_ids: List of corpus IDs (parallel to corpus)
|
||||||
api_key: Anthropic API key string
|
api_key: Anthropic API key (only required for backend="anthropic")
|
||||||
top_k: How many top sessions to send to LLM (default: 10)
|
top_k: How many top sessions to send to LLM (default: 10)
|
||||||
model: Claude model ID for reranking (default: haiku)
|
model: Model id (Claude model for anthropic, e.g. "minimax-m2.7:cloud" for ollama)
|
||||||
|
backend: "anthropic" or "ollama"
|
||||||
|
base_url: Override base URL (ollama default: http://localhost:11434)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Reordered rankings list with LLM's best pick promoted to rank 1.
|
Reordered rankings list with LLM's best pick promoted to rank 1.
|
||||||
@@ -2796,7 +2807,6 @@ def llm_rerank(
|
|||||||
if not candidates:
|
if not candidates:
|
||||||
return rankings
|
return rankings
|
||||||
|
|
||||||
# Format sessions for the prompt — first 500 chars each, labelled 1..N
|
|
||||||
session_blocks = []
|
session_blocks = []
|
||||||
for rank, idx in enumerate(candidates):
|
for rank, idx in enumerate(candidates):
|
||||||
text = corpus[idx][:500].replace("\n", " ").strip()
|
text = corpus[idx][:500].replace("\n", " ").strip()
|
||||||
@@ -2813,6 +2823,21 @@ def llm_rerank(
|
|||||||
f"Most relevant session number:"
|
f"Most relevant session number:"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if backend == "ollama":
|
||||||
|
url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
|
||||||
|
payload = json.dumps(
|
||||||
|
{
|
||||||
|
"model": model,
|
||||||
|
"messages": [{"role": "user", "content": prompt}],
|
||||||
|
"max_tokens": 1024,
|
||||||
|
"temperature": 0.0,
|
||||||
|
}
|
||||||
|
).encode("utf-8")
|
||||||
|
headers = {"content-type": "application/json"}
|
||||||
|
if api_key:
|
||||||
|
headers["authorization"] = f"Bearer {api_key}"
|
||||||
|
else:
|
||||||
|
url = "https://api.anthropic.com/v1/messages"
|
||||||
payload = json.dumps(
|
payload = json.dumps(
|
||||||
{
|
{
|
||||||
"model": model,
|
"model": model,
|
||||||
@@ -2820,42 +2845,46 @@ def llm_rerank(
|
|||||||
"messages": [{"role": "user", "content": prompt}],
|
"messages": [{"role": "user", "content": prompt}],
|
||||||
}
|
}
|
||||||
).encode("utf-8")
|
).encode("utf-8")
|
||||||
|
|
||||||
req = urllib.request.Request(
|
|
||||||
"https://api.anthropic.com/v1/messages",
|
|
||||||
data=payload,
|
|
||||||
headers = {
|
headers = {
|
||||||
"x-api-key": api_key,
|
"x-api-key": api_key,
|
||||||
"anthropic-version": "2023-06-01",
|
"anthropic-version": "2023-06-01",
|
||||||
"content-type": "application/json",
|
"content-type": "application/json",
|
||||||
},
|
}
|
||||||
method="POST",
|
|
||||||
)
|
req = urllib.request.Request(url, data=payload, headers=headers, method="POST")
|
||||||
|
|
||||||
import socket as _socket
|
import socket as _socket
|
||||||
|
|
||||||
for _attempt in range(3):
|
for _attempt in range(3):
|
||||||
try:
|
try:
|
||||||
with urllib.request.urlopen(req, timeout=20) as resp:
|
with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 20) as resp:
|
||||||
result = json.loads(resp.read())
|
result = json.loads(resp.read())
|
||||||
|
if backend == "ollama":
|
||||||
|
msg = result["choices"][0]["message"]
|
||||||
|
# Reasoning models (e.g. minimax-m2.7) may emit final answer in "content"
|
||||||
|
# or embed it in "reasoning". Try content first, fall back to reasoning.
|
||||||
|
raw = (msg.get("content") or "").strip()
|
||||||
|
if not raw:
|
||||||
|
raw = (msg.get("reasoning") or "").strip()
|
||||||
|
else:
|
||||||
raw = result["content"][0]["text"].strip()
|
raw = result["content"][0]["text"].strip()
|
||||||
# Parse just the first integer from Haiku's response
|
m = re.search(
|
||||||
m = re.search(r"\b(\d+)\b", raw)
|
r"\b(\d+)\b", raw[::-1]
|
||||||
|
) # take LAST integer (rerank models often reason first)
|
||||||
if m:
|
if m:
|
||||||
pick = int(m.group(1))
|
pick = int(m.group(1)[::-1])
|
||||||
if 1 <= pick <= len(candidates):
|
if 1 <= pick <= len(candidates):
|
||||||
chosen_idx = candidates[pick - 1]
|
chosen_idx = candidates[pick - 1]
|
||||||
reordered = [chosen_idx] + [i for i in rankings if i != chosen_idx]
|
reordered = [chosen_idx] + [i for i in rankings if i != chosen_idx]
|
||||||
return reordered
|
return reordered
|
||||||
break # Got a response, even if unparseable — don't retry
|
break
|
||||||
except (_socket.timeout, TimeoutError):
|
except (_socket.timeout, TimeoutError):
|
||||||
if _attempt < 2:
|
if _attempt < 2:
|
||||||
import time as _time
|
import time as _time
|
||||||
|
|
||||||
_time.sleep(3) # brief pause then retry
|
_time.sleep(3)
|
||||||
# else fall through to return rankings
|
|
||||||
except (urllib.error.URLError, KeyError, ValueError, IndexError, OSError):
|
except (urllib.error.URLError, KeyError, ValueError, IndexError, OSError):
|
||||||
break # Non-timeout error — fall back immediately
|
break
|
||||||
|
|
||||||
return rankings
|
return rankings
|
||||||
|
|
||||||
@@ -2919,6 +2948,8 @@ def run_benchmark(
|
|||||||
skip_precompute=False,
|
skip_precompute=False,
|
||||||
split_file=None,
|
split_file=None,
|
||||||
split_subset=None,
|
split_subset=None,
|
||||||
|
llm_backend="anthropic",
|
||||||
|
llm_base_url="",
|
||||||
):
|
):
|
||||||
"""Run the full benchmark.
|
"""Run the full benchmark.
|
||||||
|
|
||||||
@@ -2947,10 +2978,14 @@ def run_benchmark(
|
|||||||
api_key = ""
|
api_key = ""
|
||||||
if llm_rerank_enabled or mode == "diary":
|
if llm_rerank_enabled or mode == "diary":
|
||||||
api_key = _load_api_key(llm_key)
|
api_key = _load_api_key(llm_key)
|
||||||
if not api_key:
|
# Ollama backend doesn't require an Anthropic API key; a local/cloud Ollama
|
||||||
|
# daemon with the requested model pulled is enough. Diary mode is always anthropic.
|
||||||
|
needs_key = (llm_backend == "anthropic") or (mode == "diary")
|
||||||
|
if needs_key and not api_key:
|
||||||
print(
|
print(
|
||||||
"ERROR: --llm-rerank / --mode diary requires an API key. "
|
"ERROR: --llm-rerank (anthropic backend) / --mode diary requires an API key. "
|
||||||
"Set ANTHROPIC_API_KEY or use --llm-key."
|
"Set ANTHROPIC_API_KEY or use --llm-key. For ollama backend, pass "
|
||||||
|
"--llm-backend ollama."
|
||||||
)
|
)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
@@ -3100,7 +3135,15 @@ def run_benchmark(
|
|||||||
if llm_rerank_enabled:
|
if llm_rerank_enabled:
|
||||||
rerank_pool = 20 if mode in ("hybrid_v3", "hybrid_v4", "palace") else 10
|
rerank_pool = 20 if mode in ("hybrid_v3", "hybrid_v4", "palace") else 10
|
||||||
rankings = llm_rerank(
|
rankings = llm_rerank(
|
||||||
question, rankings, corpus, corpus_ids, api_key, top_k=rerank_pool, model=llm_model
|
question,
|
||||||
|
rankings,
|
||||||
|
corpus,
|
||||||
|
corpus_ids,
|
||||||
|
api_key,
|
||||||
|
top_k=rerank_pool,
|
||||||
|
model=llm_model,
|
||||||
|
backend=llm_backend,
|
||||||
|
base_url=llm_base_url,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Evaluate at session level
|
# Evaluate at session level
|
||||||
@@ -3276,7 +3319,21 @@ if __name__ == "__main__":
|
|||||||
default="claude-haiku-4-5-20251001",
|
default="claude-haiku-4-5-20251001",
|
||||||
help="Model for LLM re-ranking and diary ingest "
|
help="Model for LLM re-ranking and diary ingest "
|
||||||
"(default: claude-haiku-4-5-20251001). "
|
"(default: claude-haiku-4-5-20251001). "
|
||||||
"Use 'claude-sonnet-4-6' for Sonnet comparison.",
|
"Use 'claude-sonnet-4-6' for Sonnet comparison. "
|
||||||
|
"With --llm-backend ollama, use an Ollama model tag like 'minimax-m2.7:cloud'.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--llm-backend",
|
||||||
|
choices=["anthropic", "ollama"],
|
||||||
|
default="anthropic",
|
||||||
|
help="Which API to hit for --llm-rerank. 'anthropic' (default) uses Anthropic's "
|
||||||
|
"/v1/messages endpoint. 'ollama' uses Ollama's OpenAI-compatible "
|
||||||
|
"/v1/chat/completions endpoint (works with local Ollama and Ollama Cloud).",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--llm-base-url",
|
||||||
|
default="",
|
||||||
|
help="Override base URL for --llm-backend ollama. Defaults to http://localhost:11434.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--diary-cache",
|
"--diary-cache",
|
||||||
@@ -3380,4 +3437,6 @@ if __name__ == "__main__":
|
|||||||
args.skip_precompute,
|
args.skip_precompute,
|
||||||
split_file=args.split_file,
|
split_file=args.split_file,
|
||||||
split_subset=split_subset,
|
split_subset=split_subset,
|
||||||
|
llm_backend=args.llm_backend,
|
||||||
|
llm_base_url=args.llm_base_url,
|
||||||
)
|
)
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user