frozen_walkforward.py 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612
  1. from __future__ import annotations
  2. import copy
  3. import json
  4. from dataclasses import dataclass
  5. from typing import Any, Callable, Iterable, Mapping, Sequence
  6. import pandas as pd
  7. from backtest.engine import compute_metrics, run_backtest
  8. from backtest.utility import core_utility, utility_from_metrics, utility_status
  9. from features.quality import enforce_feature_information_gate
  10. from backtest.walkforward import WindowSpec
  11. from features.pipeline import build_feature_table
  12. from model.policy import build_exposure_plan
  13. from model.scores import build_scores
  14. from model.state_machine import run_state_machine
@dataclass(frozen=True)
class HypothesisCandidate:
    """A named bundle of configuration overrides evaluated as one hypothesis.

    The dataclass is frozen, so its fields cannot be rebound after
    construction. The ``overrides`` dict itself is still a mutable object.
    """

    # Identifier used to label the candidate in result rows and rankings.
    candidate_id: str
    # Nested config fragment that is deep-merged over the base config.
    overrides: dict[str, Any]
  19. DEFAULT_HYPOTHESIS_CANDIDATES: tuple[HypothesisCandidate, ...] = (
  20. HypothesisCandidate(
  21. candidate_id='defensive',
  22. overrides={
  23. 'policy': {
  24. 'trend': 0.80,
  25. 'euphoric_late': 0.30,
  26. 'chop': 0.20,
  27. 'repair_rebound_base': 0.30,
  28. 'repair_rebound_max': 0.65,
  29. },
  30. 'trading': {
  31. 'max_daily_exposure_change': 0.20,
  32. },
  33. },
  34. ),
  35. HypothesisCandidate(candidate_id='baseline', overrides={}),
  36. HypothesisCandidate(
  37. candidate_id='balanced_capture',
  38. overrides={
  39. 'policy': {
  40. 'trend': 0.95,
  41. 'euphoric_late': 0.65,
  42. 'chop': 0.35,
  43. 'repair_rebound_base': 0.40,
  44. 'repair_rebound_max': 0.85,
  45. },
  46. 'trading': {
  47. 'max_daily_exposure_change': 0.30,
  48. },
  49. },
  50. ),
  51. HypothesisCandidate(
  52. candidate_id='pro_risk',
  53. overrides={
  54. 'policy': {
  55. 'trend': 1.00,
  56. 'euphoric_late': 0.70,
  57. 'chop': 0.45,
  58. 'repair_rebound_base': 0.50,
  59. 'repair_rebound_max': 0.95,
  60. },
  61. 'trading': {
  62. 'max_daily_exposure_change': 0.35,
  63. },
  64. },
  65. ),
  66. )
# Signature of a pluggable strategy runner: it receives a data frame and a
# config dict and returns a (frame, frame, metrics) triple. The default runner,
# run_strategy_bundle, returns (planned, ledger, metrics) in that order.
StrategyRunner = Callable[[pd.DataFrame, dict[str, Any]], tuple[pd.DataFrame, pd.DataFrame, dict[str, float]]]
  68. def _deep_merge_dict(base: Mapping[str, Any], overrides: Mapping[str, Any]) -> dict[str, Any]:
  69. out = copy.deepcopy(dict(base))
  70. for key, value in overrides.items():
  71. if isinstance(value, Mapping) and isinstance(out.get(key), Mapping):
  72. out[key] = _deep_merge_dict(dict(out[key]), value)
  73. else:
  74. out[key] = copy.deepcopy(value)
  75. return out
  76. def _resolve_utility(metrics: Mapping[str, float], config: Mapping[str, Any] | None = None) -> tuple[float, str]:
  77. evaluation_cfg = dict((config or {}).get('evaluation', {}))
  78. utility_total_score = float(
  79. metrics.get(
  80. 'utility_total_score',
  81. utility_from_metrics(
  82. dict(metrics),
  83. upside_target=float(evaluation_cfg.get('utility_upside_target', 0.55)),
  84. turnover_penalty_start=float(evaluation_cfg.get('utility_turnover_penalty_start', 8.0)),
  85. turnover_penalty_rate=float(evaluation_cfg.get('utility_turnover_penalty_rate', 0.010)),
  86. ),
  87. )
  88. )
  89. utility_state = str(metrics.get('utility_status', utility_status(utility_total_score)))
  90. return utility_total_score, utility_state
  91. def run_strategy_bundle(df: pd.DataFrame, config: dict[str, Any]) -> tuple[pd.DataFrame, pd.DataFrame, dict[str, float]]:
  92. featured = build_feature_table(df)
  93. enforce_feature_information_gate(featured, config)
  94. scored = build_scores(featured)
  95. stated = run_state_machine(scored, config)
  96. planned = build_exposure_plan(stated, config)
  97. ledger, metrics = run_backtest(planned, config)
  98. utility_total_score, utility_state = _resolve_utility(metrics, config)
  99. out_metrics = dict(metrics)
  100. out_metrics['utility_total_score'] = utility_total_score
  101. out_metrics['utility_status'] = utility_state
  102. return planned, ledger, out_metrics
  103. def normalize_hypothesis_candidates(raw_candidates: Iterable[Mapping[str, Any]] | None) -> list[HypothesisCandidate]:
  104. if raw_candidates is None:
  105. return [copy.deepcopy(candidate) for candidate in DEFAULT_HYPOTHESIS_CANDIDATES]
  106. candidates: list[HypothesisCandidate] = []
  107. for idx, item in enumerate(raw_candidates):
  108. candidate_id = str(item.get('id', item.get('candidate_id', f'candidate_{idx + 1}'))).strip()
  109. if not candidate_id:
  110. raise ValueError(f'Candidate index {idx} is missing an id.')
  111. overrides_raw = item.get('overrides', {})
  112. if not isinstance(overrides_raw, Mapping):
  113. raise ValueError(f'Candidate {candidate_id} overrides must be an object.')
  114. candidates.append(HypothesisCandidate(candidate_id=candidate_id, overrides=dict(overrides_raw)))
  115. if not candidates:
  116. raise ValueError('At least one hypothesis candidate is required.')
  117. ids = [candidate.candidate_id for candidate in candidates]
  118. if len(set(ids)) != len(ids):
  119. raise ValueError(f'Duplicate candidate ids found: {ids}')
  120. return candidates
  121. def _candidate_config(base_config: Mapping[str, Any], candidate: HypothesisCandidate) -> dict[str, Any]:
  122. merged = _deep_merge_dict(base_config, candidate.overrides)
  123. merged['_candidate_id'] = candidate.candidate_id
  124. return merged
  125. def _prefixed_metrics(prefix: str, metrics: Mapping[str, Any]) -> dict[str, Any]:
  126. out: dict[str, Any] = {}
  127. for key, value in metrics.items():
  128. if isinstance(value, (int, float)):
  129. out[f'{prefix}_{key}'] = float(value)
  130. else:
  131. out[f'{prefix}_{key}'] = value
  132. return out
  133. def _compute_window_metrics(ledger: pd.DataFrame, config: Mapping[str, Any] | None = None) -> dict[str, float]:
  134. required_columns = {'strategy_return_net', 'asset_exec_return', 'turnover'}
  135. if not required_columns.issubset(ledger.columns):
  136. raise ValueError(f'Ledger is missing required columns: {sorted(required_columns - set(ledger.columns))}')
  137. metrics = compute_metrics(
  138. strategy_returns=ledger['strategy_return_net'],
  139. benchmark_returns=ledger['asset_exec_return'],
  140. turnover=ledger['turnover'],
  141. )
  142. utility_total_score, utility_state = _resolve_utility(metrics, config)
  143. out_metrics = dict(metrics)
  144. out_metrics['utility_total_score'] = utility_total_score
  145. out_metrics['utility_status'] = utility_state
  146. return out_metrics
  147. def _window_row_base(window: WindowSpec) -> dict[str, Any]:
  148. return {
  149. 'train_start': window.train_start,
  150. 'train_end': window.train_end,
  151. 'test_start': window.test_start,
  152. 'test_end': window.test_end,
  153. }
  154. def _clip(value: float, lower: float, upper: float) -> float:
  155. return float(min(max(value, lower), upper))
  156. def _safe_float(value: Any, default: float = 0.0) -> float:
  157. try:
  158. return float(value)
  159. except (TypeError, ValueError):
  160. return float(default)
  161. def _resolve_candidate_selection_settings(config: Mapping[str, Any]) -> dict[str, Any]:
  162. frozen_cfg = dict((config or {}).get('frozen_validation', {}))
  163. evaluation_cfg = dict((config or {}).get('evaluation', {}))
  164. cfg = dict(frozen_cfg.get('candidate_selection', {}))
  165. return {
  166. 'use_hard_constraints': bool(cfg.get('use_hard_constraints', True)),
  167. 'upside_capture_min': float(cfg.get('upside_capture_min', 0.28)),
  168. 'max_drawdown_ratio_vs_benchmark': float(cfg.get('max_drawdown_ratio_vs_benchmark', 0.72)),
  169. 'annual_turnover_soft_max': float(cfg.get('annual_turnover_soft_max', 18.0)),
  170. 'annual_return_override_abs': float(cfg.get('annual_return_override_abs', 0.05)),
  171. 'annual_return_override_ratio': float(cfg.get('annual_return_override_ratio', 0.40)),
  172. 'return_ratio_weight': float(cfg.get('return_ratio_weight', 0.30)),
  173. 'upside_weight': float(cfg.get('upside_weight', 0.30)),
  174. 'drawdown_weight': float(cfg.get('drawdown_weight', 0.20)),
  175. 'sharpe_delta_weight': float(cfg.get('sharpe_delta_weight', 0.10)),
  176. 'stability_weight': float(cfg.get('stability_weight', 0.10)),
  177. 'turnover_penalty_per_unit': float(cfg.get('turnover_penalty_per_unit', 0.015)),
  178. 'score_cap': float(cfg.get('score_cap', 1.2)),
  179. 'upside_target': float(cfg.get('upside_target', 0.45)),
  180. 'drawdown_improvement_target': float(cfg.get('drawdown_improvement_target', 0.35)),
  181. 'sharpe_delta_shift': float(cfg.get('sharpe_delta_shift', 0.05)),
  182. 'sharpe_delta_scale': float(cfg.get('sharpe_delta_scale', 0.15)),
  183. 'turnover_penalty_start': float(cfg.get('turnover_penalty_start', 12.0)),
  184. 'core_utility_floor': float(cfg.get('core_utility_floor', cfg.get('utility_floor', -0.05))),
  185. 'core_utility_target': float(cfg.get('core_utility_target', cfg.get('utility_target', 0.10))),
  186. 'utility_upside_target': float(evaluation_cfg.get('utility_upside_target', 0.55)),
  187. 'fallback_mode': str(cfg.get('fallback_mode', 'closest_to_feasible_frontier')).strip().lower(),
  188. }
  189. def _compute_selection_score(metrics: Mapping[str, Any], settings: Mapping[str, Any]) -> tuple[float, dict[str, float]]:
  190. annual_return = _safe_float(metrics.get('annual_return'))
  191. benchmark_return = _safe_float(metrics.get('benchmark_return'))
  192. upside_capture = _safe_float(metrics.get('upside_capture'))
  193. max_drawdown = _safe_float(metrics.get('max_drawdown'))
  194. benchmark_max_drawdown = _safe_float(metrics.get('benchmark_max_drawdown'))
  195. sharpe_delta = _safe_float(metrics.get('sharpe_delta'))
  196. annual_turnover = _safe_float(metrics.get('annual_turnover'))
  197. score_cap = float(settings['score_cap'])
  198. upside_target = max(float(settings['upside_target']), 1e-12)
  199. drawdown_target = max(float(settings['drawdown_improvement_target']), 1e-12)
  200. sharpe_scale = max(float(settings['sharpe_delta_scale']), 1e-12)
  201. if benchmark_return > 0.05:
  202. return_ratio = _clip(annual_return / benchmark_return, 0.0, score_cap)
  203. else:
  204. return_ratio = _clip(annual_return / 0.10, 0.0, score_cap)
  205. upside_score = _clip((upside_capture - 0.15) / max(upside_target - 0.15, 1e-12), 0.0, score_cap)
  206. if benchmark_max_drawdown > 1e-12:
  207. drawdown_improvement = (benchmark_max_drawdown - max_drawdown) / benchmark_max_drawdown
  208. else:
  209. drawdown_improvement = 0.0
  210. core_utility_value = _safe_float(
  211. metrics.get(
  212. 'core_utility_score',
  213. core_utility(
  214. sharpe_delta=sharpe_delta,
  215. drawdown_improvement=drawdown_improvement,
  216. upside_capture=upside_capture,
  217. upside_target=float(settings['utility_upside_target']),
  218. ),
  219. )
  220. )
  221. drawdown_score = _clip(drawdown_improvement / drawdown_target, 0.0, score_cap)
  222. sharpe_delta_score = _clip((sharpe_delta + float(settings['sharpe_delta_shift'])) / sharpe_scale, 0.0, score_cap)
  223. stability_score = _clip(
  224. (core_utility_value - float(settings['core_utility_floor']))
  225. / max(float(settings['core_utility_target']) - float(settings['core_utility_floor']), 1e-12),
  226. 0.0,
  227. score_cap,
  228. )
  229. turnover_penalty = max(0.0, annual_turnover - float(settings['turnover_penalty_start'])) * float(
  230. settings['turnover_penalty_per_unit']
  231. )
  232. score = (
  233. float(settings['return_ratio_weight']) * return_ratio
  234. + float(settings['upside_weight']) * upside_score
  235. + float(settings['drawdown_weight']) * drawdown_score
  236. + float(settings['sharpe_delta_weight']) * sharpe_delta_score
  237. + float(settings['stability_weight']) * stability_score
  238. - turnover_penalty
  239. )
  240. return score, {
  241. 'return_ratio': return_ratio,
  242. 'upside_score': upside_score,
  243. 'drawdown_score': drawdown_score,
  244. 'sharpe_delta_score': sharpe_delta_score,
  245. 'core_utility_value': core_utility_value,
  246. 'stability_score': stability_score,
  247. 'turnover_penalty': turnover_penalty,
  248. }
  249. def _evaluate_hard_constraints(metrics: Mapping[str, Any], settings: Mapping[str, Any]) -> tuple[bool, list[str]]:
  250. reasons: list[str] = []
  251. upside_capture = _safe_float(metrics.get('upside_capture'))
  252. max_drawdown = _safe_float(metrics.get('max_drawdown'))
  253. benchmark_max_drawdown = _safe_float(metrics.get('benchmark_max_drawdown'))
  254. annual_turnover = _safe_float(metrics.get('annual_turnover'))
  255. annual_return = _safe_float(metrics.get('annual_return'))
  256. benchmark_return = _safe_float(metrics.get('benchmark_return'))
  257. if upside_capture < float(settings['upside_capture_min']):
  258. reasons.append('upside_capture_below_min')
  259. if benchmark_max_drawdown > 1e-12:
  260. drawdown_ratio = max_drawdown / benchmark_max_drawdown
  261. if drawdown_ratio > float(settings['max_drawdown_ratio_vs_benchmark']):
  262. reasons.append('drawdown_ratio_above_max')
  263. turnover_cap = float(settings['annual_turnover_soft_max'])
  264. return_override_threshold = max(
  265. float(settings['annual_return_override_abs']),
  266. float(settings['annual_return_override_ratio']) * max(benchmark_return, 0.0),
  267. )
  268. if annual_turnover > turnover_cap and annual_return < return_override_threshold:
  269. reasons.append('turnover_above_soft_max_without_return_override')
  270. return len(reasons) == 0, reasons
  271. def _constraint_distance(metrics: Mapping[str, Any], settings: Mapping[str, Any]) -> tuple[float, dict[str, float]]:
  272. upside_capture = _safe_float(metrics.get('upside_capture'))
  273. max_drawdown = _safe_float(metrics.get('max_drawdown'))
  274. benchmark_max_drawdown = _safe_float(metrics.get('benchmark_max_drawdown'))
  275. annual_turnover = _safe_float(metrics.get('annual_turnover'))
  276. annual_return = _safe_float(metrics.get('annual_return'))
  277. benchmark_return = _safe_float(metrics.get('benchmark_return'))
  278. upside_min = max(float(settings['upside_capture_min']), 1e-12)
  279. drawdown_max = max(float(settings['max_drawdown_ratio_vs_benchmark']), 1e-12)
  280. turnover_soft_max = max(float(settings['annual_turnover_soft_max']), 1e-12)
  281. return_override_threshold = max(
  282. float(settings['annual_return_override_abs']),
  283. float(settings['annual_return_override_ratio']) * max(benchmark_return, 0.0),
  284. )
  285. upside_gap = max(0.0, upside_min - upside_capture) / upside_min
  286. drawdown_ratio = (max_drawdown / benchmark_max_drawdown) if benchmark_max_drawdown > 1e-12 else 0.0
  287. drawdown_gap = max(0.0, drawdown_ratio - drawdown_max) / drawdown_max
  288. turnover_gap = 0.0
  289. if annual_turnover > turnover_soft_max and annual_return < return_override_threshold:
  290. turnover_gap = (annual_turnover - turnover_soft_max) / turnover_soft_max
  291. violation_distance = 0.50 * upside_gap + 0.30 * drawdown_gap + 0.20 * turnover_gap
  292. return float(violation_distance), {
  293. 'upside_gap': float(upside_gap),
  294. 'drawdown_gap': float(drawdown_gap),
  295. 'turnover_gap': float(turnover_gap),
  296. }
  297. def run_frozen_walkforward(
  298. raw: pd.DataFrame,
  299. config: Mapping[str, Any],
  300. windows: Sequence[WindowSpec],
  301. *,
  302. candidates: Sequence[HypothesisCandidate] | None = None,
  303. min_train_rows: int = 120,
  304. min_test_rows: int = 40,
  305. strategy_runner: StrategyRunner | None = None,
  306. ) -> tuple[pd.DataFrame, dict[str, Any]]:
  307. if min_train_rows <= 0:
  308. raise ValueError('min_train_rows must be positive.')
  309. if min_test_rows <= 0:
  310. raise ValueError('min_test_rows must be positive.')
  311. runner = strategy_runner or run_strategy_bundle
  312. candidate_list = list(candidates or DEFAULT_HYPOTHESIS_CANDIDATES)
  313. if not candidate_list:
  314. raise ValueError('At least one candidate is required for frozen walk-forward.')
  315. selection_settings = _resolve_candidate_selection_settings(config)
  316. rows: list[dict[str, Any]] = []
  317. for window in windows:
  318. train_slice = raw.loc[window.train_start:window.train_end].copy()
  319. test_slice = raw.loc[window.test_start:window.test_end].copy()
  320. row = _window_row_base(window)
  321. row['train_rows'] = int(len(train_slice))
  322. row['test_rows'] = int(len(test_slice))
  323. row['candidate_count'] = int(len(candidate_list))
  324. if len(train_slice) < min_train_rows:
  325. row['status'] = 'skipped_insufficient_train'
  326. rows.append(row)
  327. continue
  328. if len(test_slice) < min_test_rows:
  329. row['status'] = 'skipped_insufficient_test'
  330. rows.append(row)
  331. continue
  332. selected_candidate: HypothesisCandidate | None = None
  333. selected_train_metrics: dict[str, float] | None = None
  334. selected_train_utility = float('-inf')
  335. selected_train_score = float('-inf')
  336. selected_train_hard_pass = False
  337. selected_train_constraint_failures: list[str] = []
  338. selected_train_violation_distance = 0.0
  339. selected_train_violation_components: dict[str, float] = {}
  340. selection_mode = 'constraint_score'
  341. candidate_evaluations: list[dict[str, Any]] = []
  342. for candidate in candidate_list:
  343. candidate_config = _candidate_config(config, candidate)
  344. _, _, train_metrics_raw = runner(train_slice, candidate_config)
  345. train_metrics = dict(train_metrics_raw)
  346. utility_value, _ = _resolve_utility(train_metrics)
  347. train_metrics['utility_total_score'] = utility_value
  348. train_metrics['utility_status'] = utility_status(utility_value)
  349. hard_pass, hard_fail_reasons = _evaluate_hard_constraints(train_metrics, selection_settings)
  350. score_value, score_components = _compute_selection_score(train_metrics, selection_settings)
  351. violation_distance, violation_components = _constraint_distance(train_metrics, selection_settings)
  352. candidate_evaluations.append(
  353. {
  354. 'candidate': candidate,
  355. 'metrics': train_metrics,
  356. 'utility': utility_value,
  357. 'hard_pass': hard_pass,
  358. 'hard_fail_reasons': hard_fail_reasons,
  359. 'selection_score': score_value,
  360. 'selection_score_components': score_components,
  361. 'violation_distance': violation_distance,
  362. 'violation_components': violation_components,
  363. }
  364. )
  365. use_hard_constraints = bool(selection_settings['use_hard_constraints'])
  366. ranking_pool = (
  367. [item for item in candidate_evaluations if item['hard_pass']]
  368. if use_hard_constraints
  369. else candidate_evaluations
  370. )
  371. if ranking_pool:
  372. for item in ranking_pool:
  373. score_value = float(item['selection_score'])
  374. if score_value > selected_train_score:
  375. selected_train_score = score_value
  376. selected_candidate = item['candidate']
  377. selected_train_metrics = item['metrics']
  378. selected_train_utility = float(item['utility'])
  379. selected_train_hard_pass = bool(item['hard_pass'])
  380. selected_train_constraint_failures = list(item['hard_fail_reasons'])
  381. selected_train_violation_distance = float(item['violation_distance'])
  382. selected_train_violation_components = dict(item['violation_components'])
  383. else:
  384. fallback_mode = str(selection_settings.get('fallback_mode', 'closest_to_feasible_frontier')).strip().lower()
  385. if fallback_mode == 'closest_to_feasible_frontier':
  386. selection_mode = 'frontier_fallback_no_hard_pass'
  387. selected_fallback_score = float('-inf')
  388. for item in candidate_evaluations:
  389. fallback_score = -float(item['violation_distance']) + 0.25 * float(item['selection_score'])
  390. utility_value = float(item['utility'])
  391. if (
  392. fallback_score > selected_fallback_score
  393. or (
  394. fallback_score == selected_fallback_score
  395. and float(item['selection_score']) > selected_train_score
  396. )
  397. or (
  398. fallback_score == selected_fallback_score
  399. and float(item['selection_score']) == selected_train_score
  400. and utility_value > selected_train_utility
  401. )
  402. ):
  403. selected_fallback_score = fallback_score
  404. selected_train_utility = utility_value
  405. selected_candidate = item['candidate']
  406. selected_train_metrics = item['metrics']
  407. selected_train_score = float(item['selection_score'])
  408. selected_train_hard_pass = bool(item['hard_pass'])
  409. selected_train_constraint_failures = list(item['hard_fail_reasons'])
  410. selected_train_violation_distance = float(item['violation_distance'])
  411. selected_train_violation_components = dict(item['violation_components'])
  412. else:
  413. selection_mode = 'utility_fallback_no_hard_pass'
  414. for item in candidate_evaluations:
  415. utility_value = float(item['utility'])
  416. if utility_value > selected_train_utility:
  417. selected_train_utility = utility_value
  418. selected_candidate = item['candidate']
  419. selected_train_metrics = item['metrics']
  420. selected_train_score = float(item['selection_score'])
  421. selected_train_hard_pass = bool(item['hard_pass'])
  422. selected_train_constraint_failures = list(item['hard_fail_reasons'])
  423. selected_train_violation_distance = float(item['violation_distance'])
  424. selected_train_violation_components = dict(item['violation_components'])
  425. hard_pass_count = int(sum(1 for item in candidate_evaluations if bool(item['hard_pass'])))
  426. ranking_brief = [
  427. {
  428. 'candidate_id': item['candidate'].candidate_id,
  429. 'hard_pass': bool(item['hard_pass']),
  430. 'selection_score': float(item['selection_score']),
  431. 'train_utility_total_score': float(item['utility']),
  432. 'hard_fail_reasons': list(item['hard_fail_reasons']),
  433. 'violation_distance': float(item['violation_distance']),
  434. }
  435. for item in candidate_evaluations
  436. ]
  437. ranking_brief.sort(key=lambda x: (-x['hard_pass'], -x['selection_score'], -x['train_utility_total_score']))
  438. if selected_candidate is None or selected_train_metrics is None:
  439. row['status'] = 'skipped_no_candidate'
  440. rows.append(row)
  441. continue
  442. combined_slice = raw.loc[window.train_start:window.test_end].copy()
  443. candidate_config = _candidate_config(config, selected_candidate)
  444. _, combined_ledger, _ = runner(combined_slice, candidate_config)
  445. frozen_test_ledger = combined_ledger.loc[window.test_start:window.test_end].copy()
  446. if len(frozen_test_ledger) < min_test_rows:
  447. row['status'] = 'skipped_insufficient_test'
  448. rows.append(row)
  449. continue
  450. test_metrics = _compute_window_metrics(frozen_test_ledger, candidate_config)
  451. row.update(
  452. {
  453. 'status': 'ok',
  454. 'selected_candidate_id': selected_candidate.candidate_id,
  455. 'selection_mode': selection_mode,
  456. 'train_candidate_hard_pass_count': hard_pass_count,
  457. 'train_candidate_total_count': int(len(candidate_evaluations)),
  458. 'selected_train_selection_score': float(selected_train_score),
  459. 'selected_train_hard_pass': bool(selected_train_hard_pass),
  460. 'selected_train_constraint_failures': json.dumps(
  461. selected_train_constraint_failures,
  462. ensure_ascii=False,
  463. sort_keys=True,
  464. ),
  465. 'selected_train_violation_distance': float(selected_train_violation_distance),
  466. 'selected_train_violation_components': json.dumps(
  467. selected_train_violation_components,
  468. ensure_ascii=False,
  469. sort_keys=True,
  470. ),
  471. 'train_candidate_rankings': json.dumps(ranking_brief, ensure_ascii=False, sort_keys=True),
  472. 'selected_candidate_overrides': json.dumps(
  473. selected_candidate.overrides,
  474. ensure_ascii=False,
  475. sort_keys=True,
  476. ),
  477. }
  478. )
  479. row.update(_prefixed_metrics('train', selected_train_metrics))
  480. row.update(_prefixed_metrics('test', test_metrics))
  481. rows.append(row)
  482. board = pd.DataFrame(rows)
  483. if board.empty:
  484. board = pd.DataFrame(columns=['status'])
  485. ok_board = board[board['status'] == 'ok'].copy() if 'status' in board.columns else pd.DataFrame()
  486. selected_distribution = (
  487. ok_board['selected_candidate_id'].value_counts().to_dict() if 'selected_candidate_id' in ok_board.columns else {}
  488. )
  489. status_counts = board['status'].value_counts().to_dict() if 'status' in board.columns else {}
  490. selection_mode_distribution = (
  491. ok_board['selection_mode'].value_counts().to_dict() if not ok_board.empty and 'selection_mode' in ok_board.columns else {}
  492. )
  493. windows_with_hard_pass_candidate_count = (
  494. int((ok_board['train_candidate_hard_pass_count'] > 0).sum())
  495. if not ok_board.empty and 'train_candidate_hard_pass_count' in ok_board.columns
  496. else 0
  497. )
  498. hard_pass_window_ratio = (
  499. float(windows_with_hard_pass_candidate_count / len(ok_board))
  500. if len(ok_board) > 0
  501. else 0.0
  502. )
  503. positive_window_ratio = (
  504. float((ok_board['test_utility_total_score'] > 0.0).mean())
  505. if not ok_board.empty and 'test_utility_total_score' in ok_board.columns
  506. else 0.0
  507. )
  508. fallback_distance_distribution = (
  509. ok_board.loc[
  510. ok_board['selection_mode'].isin({'frontier_fallback_no_hard_pass', 'utility_fallback_no_hard_pass'}),
  511. 'selected_train_violation_distance',
  512. ]
  513. .dropna()
  514. .tolist()
  515. if not ok_board.empty
  516. and 'selection_mode' in ok_board.columns
  517. and 'selected_train_violation_distance' in ok_board.columns
  518. else []
  519. )
  520. summary = {
  521. 'total_windows': int(len(windows)),
  522. 'processed_window_count': int(len(ok_board)),
  523. 'skipped_window_count': int(max(len(windows) - len(ok_board), 0)),
  524. 'positive_window_ratio': positive_window_ratio,
  525. 'positive_window_ratio_role': 'diagnostic_only',
  526. 'primary_acceptance_metrics': ['primary_window_success_ratio', 'hard_pass_window_ratio'],
  527. 'selected_candidate_distribution': selected_distribution,
  528. 'window_status_counts': status_counts,
  529. 'selection_mode_distribution': selection_mode_distribution,
  530. 'windows_with_hard_pass_candidate_count': windows_with_hard_pass_candidate_count,
  531. 'windows_without_hard_pass_candidate_count': int(max(len(ok_board) - windows_with_hard_pass_candidate_count, 0)),
  532. 'hard_pass_window_ratio': hard_pass_window_ratio,
  533. 'fallback_distance_distribution': [float(x) for x in fallback_distance_distribution],
  534. 'candidate_ids': [candidate.candidate_id for candidate in candidate_list],
  535. 'min_train_rows': int(min_train_rows),
  536. 'min_test_rows': int(min_test_rows),
  537. 'candidate_selection': selection_settings,
  538. }
  539. return board, summary