Browse Source

Import index-rotation project into cyb50-quant

erwin 1 month ago
parent
commit
ace7ef22fb
63 changed files with 2672 additions and 0 deletions
  1. 5 0
      .gitignore
  2. 0 0
      index-rotation/.codex
  3. 246 0
      index-rotation/README.md
  4. 39 0
      index-rotation/TASK_PROMPT.md
  5. 125 0
      index-rotation/TASK_PROMPT_PHASE2.md
  6. 29 0
      index-rotation/configs/instruments.yaml
  7. 7 0
      index-rotation/configs/strategy/top2_every_5_days.yaml
  8. 7 0
      index-rotation/configs/strategy/top2_weekly.yaml
  9. 1 0
      index-rotation/data/clean/.gitkeep
  10. BIN
      index-rotation/data/clean/chinext50/daily.parquet
  11. BIN
      index-rotation/data/clean/hs300/daily.parquet
  12. BIN
      index-rotation/data/clean/sse50/daily.parquet
  13. BIN
      index-rotation/data/clean/star50/daily.parquet
  14. 1 0
      index-rotation/data/features/.gitkeep
  15. BIN
      index-rotation/data/features/chinext50/daily.parquet
  16. BIN
      index-rotation/data/features/hs300/daily.parquet
  17. BIN
      index-rotation/data/features/sse50/daily.parquet
  18. BIN
      index-rotation/data/features/star50/daily.parquet
  19. 12 0
      index-rotation/data/meta/fetch_log.jsonl
  20. 158 0
      index-rotation/data/meta/manifest.json
  21. 1 0
      index-rotation/data/raw/.gitkeep
  22. BIN
      index-rotation/data/raw/chinext50/price.parquet
  23. BIN
      index-rotation/data/raw/hs300/price.parquet
  24. BIN
      index-rotation/data/raw/sse50/price.parquet
  25. BIN
      index-rotation/data/raw/star50/price.parquet
  26. 11 0
      index-rotation/outputs/backtests/top2_every_5_days/summary.json
  27. 11 0
      index-rotation/outputs/backtests/top2_weekly/summary.json
  28. 26 0
      index-rotation/pyproject.toml
  29. 1 0
      index-rotation/src/__init__.py
  30. 3 0
      index-rotation/src/backtest/__init__.py
  31. 170 0
      index-rotation/src/backtest/engine.py
  32. 30 0
      index-rotation/src/backtest/execution.py
  33. 49 0
      index-rotation/src/backtest/metrics.py
  34. 114 0
      index-rotation/src/backtest/run.py
  35. 1 0
      index-rotation/src/data/__init__.py
  36. 26 0
      index-rotation/src/data/backfill.py
  37. 28 0
      index-rotation/src/data/bootstrap.py
  38. 46 0
      index-rotation/src/data/cli_common.py
  39. 34 0
      index-rotation/src/data/config.py
  40. 18 0
      index-rotation/src/data/exceptions.py
  41. 99 0
      index-rotation/src/data/metadata.py
  42. 18 0
      index-rotation/src/data/models.py
  43. 313 0
      index-rotation/src/data/pipeline.py
  44. 10 0
      index-rotation/src/data/providers/__init__.py
  45. 76 0
      index-rotation/src/data/providers/akshare.py
  46. 21 0
      index-rotation/src/data/providers/base.py
  47. 26 0
      index-rotation/src/data/repair.py
  48. 55 0
      index-rotation/src/data/status.py
  49. 106 0
      index-rotation/src/data/storage.py
  50. 104 0
      index-rotation/src/data/transform.py
  51. 28 0
      index-rotation/src/data/update.py
  52. 4 0
      index-rotation/src/portfolio/__init__.py
  53. 18 0
      index-rotation/src/portfolio/allocator.py
  54. 37 0
      index-rotation/src/portfolio/rebalance.py
  55. 3 0
      index-rotation/src/signals/__init__.py
  56. 24 0
      index-rotation/src/signals/ranker.py
  57. 35 0
      index-rotation/src/signals/scorer.py
  58. 28 0
      index-rotation/src/signals/selector.py
  59. 21 0
      index-rotation/src/signals/trend.py
  60. 54 0
      index-rotation/tests/test_cli.py
  61. 79 0
      index-rotation/tests/test_phase2_backtest.py
  62. 115 0
      index-rotation/tests/test_phase2_signals.py
  63. 199 0
      index-rotation/tests/test_pipeline.py

+ 5 - 0
.gitignore

@@ -40,6 +40,11 @@ wheels/
 # Local config
 .openclaw/
 
+# Embedded project runtime artifacts
+index-rotation/.venv/
+index-rotation/.codex/
+index-rotation/.codex-orchestrator/
+
 # Windows reserved device name
 NUL
 nul

+ 0 - 0
index-rotation/.codex


+ 246 - 0
index-rotation/README.md

@@ -0,0 +1,246 @@
+# A 股四指数轮动系统 v1
+
+本仓库当前已实现:
+
+- 第一阶段:数据层
+- 第二阶段:信号层、组合层、回测层最小可用版本
+
+当前仍然是指数研究层回测,不包含个股逻辑、ETF 选择逻辑或实盘交易接口。
+
+## 覆盖范围
+
+- 固定标的:上证50、沪深300、创业板50、科创50(四指数共同样本区间自 2019-12-31 起,受科创50 起点限制)
+- 数据频率:日线
+- 数据来源:本地 `data/` 分层
+- 回测层只读取本地 `clean/features` 衍生结果,不访问网络
+
+## 当前模块
+
+```text
+configs/
+  instruments.yaml
+  strategy/
+    top2_weekly.yaml
+    top2_every_5_days.yaml
+data/
+  raw/
+  clean/
+  features/
+  meta/
+src/
+  data/
+  signals/
+    trend.py
+    ranker.py
+    scorer.py
+    selector.py
+  portfolio/
+    allocator.py
+    rebalance.py
+  backtest/
+    execution.py
+    engine.py
+    metrics.py
+    run.py
+tests/
+```
+
+## 数据分层
+
+- `raw`:provider 原始缓存
+- `clean`:标准化日线
+- `features`:只基于本地 `clean` 计算的 rolling 特征
+
+第二阶段 v1 使用的核心字段来自 `features`:
+
+- `close`
+- `daily_return`
+- `ret_5d` / `ret_10d` / `ret_20d` / `ret_60d`
+- `ma_20` / `ma_60`
+- `vol_10d` / `vol_20d`
+
+## 策略 v1 规则
+
+### 1. 趋势过滤
+
+在 `signal_date` 对每个指数检查:
+
+- `close > ma_20`
+- `close > ma_60`
+- `ma_20 > ma_60`
+- `ret_20d > 0`
+
+满足至少 2 条时,`trend_pass = True`。
+
+### 2. 综合打分
+
+只允许 `trend_pass=True` 的标的进入最终选择,但横截面 rank 仍基于当日 4 个指数计算。
+
+动量分数:
+
+- `ret_5d * 0.20`
+- `ret_10d * 0.25`
+- `ret_20d * 0.30`
+- `ret_60d * 0.25`
+
+风险惩罚:
+
+- `vol_10d * 0.60`
+- `vol_20d * 0.40`
+
+最终分数:
+
+```text
+final_score = score_mom - 0.30 * score_risk_penalty
+```
+
+### 3. 组合规则
+
+- `Top2`:若合格标的数 `>= 2`,持有前两名,各 `50%`
+- `Top2`:若合格标的数 `= 1`,单标的 `100%`
+- `Top1`:只持有第一名
+- 若合格标的数 `= 0`,空仓
+
+### 4. 调仓频率
+
+当前支持:
+
+- `weekly`
+- `every_5_days`
+- `daily`(用于测试和最小实现,不是主配置)
+
+## 信号与执行时点语义
+
+这是第二阶段最重要的约束:
+
+- `t` 日收盘后,使用 `t` 日 `close` 与历史数据生成信号
+- 在 `t+1` 日执行调仓
+- 不允许用 `t` 日收盘信号按 `t` 日收盘成交
+
+当前回测只有日频 close 数据,因此 v1 采用保守语义:
+
+- `t+1` 当天先计入旧仓位从 `t -> t+1` 的日收益
+- 再在 `t+1` 收盘执行调仓
+- 新仓位从 `t+2` 的日收益开始生效
+
+这能明确避免未来函数和时点错配。测试已覆盖该约束。
+
+## 回测输出
+
+回测至少输出:
+
+- `cumulative_return`
+- `annual_return`
+- `max_drawdown`
+- `annual_volatility`
+- `sharpe`
+- `calmar`
+- `turnover`
+- `rebalance_count`
+- `cash_days_ratio`
+
+同时输出:
+
+- 每日净值 `daily_nav.csv`
+- 每日持仓 `daily_holdings.csv`
+- 调仓记录 `rebalances.csv`
+
+## 运行方式
+
+Top2 周频:
+
+```bash
+python3 -m src.backtest.run --config configs/strategy/top2_weekly.yaml
+```
+
+Top2 每 5 个交易日:
+
+```bash
+python3 -m src.backtest.run --config configs/strategy/top2_every_5_days.yaml
+```
+
+默认输出目录:
+
+```text
+outputs/backtests/<config_name>/
+```
+
+例如:
+
+- `outputs/backtests/top2_weekly/summary.json`
+- `outputs/backtests/top2_weekly/daily_nav.csv`
+- `outputs/backtests/top2_weekly/daily_holdings.csv`
+- `outputs/backtests/top2_weekly/rebalances.csv`
+
+## 第一阶段数据 CLI
+
+首次全量初始化:
+
+```bash
+python3 -m src.data.bootstrap --all
+```
+
+按上次成功抓取位置增量更新:
+
+```bash
+python3 -m src.data.update --since-last
+```
+
+单标的历史回补:
+
+```bash
+python3 -m src.data.backfill --instrument sse50 --start 2003-12-31
+```
+
+从本地层修复下游层:
+
+```bash
+python3 -m src.data.repair --instrument sse50 --layer features
+```
+
+查看状态:
+
+```bash
+python3 -m src.data.status
+```
+
+## 依赖
+
+- `python >= 3.10`
+- `pandas`
+- `PyYAML`
+- `pyarrow`
+- `akshare`
+
+## 测试
+
+运行全量测试:
+
+```bash
+python3 -m unittest discover -s tests -v
+```
+
+当前测试覆盖:
+
+- 数据层编排与修复
+- 特征不使用未来数据
+- 趋势过滤规则
+- 排序与打分
+- `Top1 / Top2 / 空仓` 分配
+- `t` 日信号、`t+1` 执行的时点约束
+- 回测净值、持仓、调仓记录的基础正确性
+
+## 边界与后续
+
+当前版本仍有意保持最小化:
+
+- 不做 ETF 映射
+- 不做成交价建模扩展
+- 不做实盘交易接入
+- 不引入个股层逻辑
+
+若进入下一阶段,优先建议:
+
+- 增加成交成本和现金收益率的更细粒度建模
+- 增加参数化样本区间与输出报表
+- 增加基准对比与分年度绩效拆解

+ 39 - 0
index-rotation/TASK_PROMPT.md

@@ -0,0 +1,39 @@
+你现在要在当前仓库中实现 A 股指数轮动系统 v1 的数据层优先版。当前阶段严禁实现完整策略、信号引擎、仓位分配、回测执行,只做数据基础设施。
+
+标的固定为上证50、沪深300、创业板50、科创50。
+
+请完成:
+1)configs/instruments.yaml;
+2)provider 抽象;
+3)data/raw、data/clean、data/features、data/meta 目录结构;
+4)raw 原始缓存、clean 标准化层、features 特征层;
+5)manifest.json 和 fetch_log.jsonl;
+6)CLI:python -m src.data.bootstrap --all、python -m src.data.update --since-last、python -m src.data.backfill --instrument XXX --start YYYY-MM-DD、python -m src.data.repair --instrument XXX --layer features、python -m src.data.status;
+7)必要测试与 README。
+
+约束:
+- 使用 parquet;
+- 历史按真实起点抓取;
+- 默认只增量更新;
+- 未来回测只能读本地;
+- rolling 特征不能使用未来数据;
+- 请求要限速、重试、串行;
+- 项目分层清晰,可替换 provider;
+- 先支持价格指数;
+- 单指数最长历史与四指数共同样本区间都要在文档里说明;
+- 不要在本阶段实现完整回测、择时、Top1/Top2 轮动逻辑。
+
+建议指数起点:
+- 上证50:2003-12-31 或最早可得日期
+- 沪深300:2004-12-31 或最早可得日期
+- 创业板50:2010-05-31
+- 科创50:2019-12-31
+
+工作方式:
+- 先输出实现计划;
+- 再创建项目结构并逐步实现;
+- 每完成一部分就运行测试;
+- 最后总结已完成模块、未完成项、测试结果、假设与下一阶段建议。
+
+当你完全完成后,请执行:
+openclaw system event --text "Done: index-rotation data layer implemented" --mode now

+ 125 - 0
index-rotation/TASK_PROMPT_PHASE2.md

@@ -0,0 +1,125 @@
+你现在要在当前仓库中实现 A 股四指数轮动系统 v1 的第二阶段:信号层、组合层、回测层最小可用版本。
+
+重要背景:
+- 第一阶段数据层已经完成,并且本地数据已建库成功。
+- 标的固定为 4 个指数:上证50、沪深300、创业板50、科创50。
+- 当前仍然是“指数研究层回测”,不是 ETF 执行层。
+- 严禁引入个股逻辑、ETF 选择逻辑、实盘交易接口。
+
+本阶段目标:
+1)实现趋势过滤信号;
+2)实现相对强弱排序与风险惩罚打分;
+3)实现 Top1 / Top2 / 空仓组合构建;
+4)实现最小可用回测引擎;
+5)输出基础绩效指标;
+6)补必要测试与 README 更新。
+
+必须严格遵守的时点语义:
+- t 日收盘后,根据 t 日 close 与历史数据生成信号;
+- 在 t+1 日执行调仓;
+- 不允许使用 t 日收盘信号并按 t 日收盘成交;
+- 测试里必须覆盖这个约束,防止未来函数和时点错配。
+
+请实现的策略 v1 规则:
+
+一、趋势过滤
+对每个指数在 signal_date 计算以下规则:
+- close > ma_20
+- close > ma_60
+- ma_20 > ma_60
+- ret_20d > 0
+满足至少 2 条,trend_pass = True。
+
+二、综合打分
+仅对 trend_pass=True 的指数参与排序。
+打分由两部分组成:
+1. 动量分数(横截面 rank,标的池为当日 4 个指数)
+   - ret_5d 权重 0.20
+   - ret_10d 权重 0.25
+   - ret_20d 权重 0.30
+   - ret_60d 权重 0.25
+2. 风险惩罚(横截面 rank)
+   - vol_10d 权重 0.60
+   - vol_20d 权重 0.40
+最终:final_score = score_mom - 0.30 * score_risk_penalty
+
+三、组合规则
+- 若合格标的 >= 2:支持 Top2 等权(50/50)
+- 若合格标的 = 1:100% 持有该指数
+- 若合格标的 = 0:空仓
+- 同时支持 Top1 模式,作为可配置项
+
+四、调仓频率
+先实现:
+- weekly
+- every_5_days
+可选支持 daily,但不是重点。
+
+五、回测层
+- 使用本地 clean/features 数据
+- 使用 trade_date 序列推进
+- signal_date 与 execution_date 分离
+- 默认无摩擦回测,但要预留成本字段:commission_bps / slippage_bps
+- 默认 cash_return = 0
+
+六、指标输出
+至少输出:
+- cumulative_return
+- annual_return
+- max_drawdown
+- annual_volatility
+- sharpe
+- calmar
+- turnover
+- rebalance_count
+- cash_days_ratio
+并输出每日净值、每日持仓、调仓记录。
+
+七、建议代码结构
+建议新增:
+- src/signals/trend.py
+- src/signals/ranker.py
+- src/signals/scorer.py
+- src/signals/selector.py
+- src/portfolio/allocator.py
+- src/portfolio/rebalance.py
+- src/backtest/engine.py
+- src/backtest/execution.py
+- src/backtest/metrics.py
+如果你认为需要微调结构,可以调整,但必须保持清晰。
+
+八、CLI / 可运行入口
+请至少提供一个可运行入口,例如:
+- python3 -m src.backtest.run --config configs/strategy/top2_weekly.yaml
+或者等价命令。
+需要有最小配置文件支持 weekly / every_5_days 两个版本。
+
+九、测试要求
+至少覆盖:
+- t 日信号、t+1 执行的时点测试
+- trend filter 规则测试
+- ranking / scoring 测试
+- Top1 / Top2 / 空仓分配测试
+- 回测净值与调仓记录基本正确性测试
+
+十、文档要求
+更新 README,补充:
+- 第二阶段模块说明
+- 信号与执行时点语义
+- 如何运行 weekly / every_5_days 回测
+- 目前仍为指数研究层,不是实盘执行层
+
+工作方式:
+- 先输出实现计划;
+- 再按模块逐步实现;
+- 每完成一部分就运行对应测试;
+- 最后运行完整测试;
+- 给出已完成模块、未完成项、假设与下一步建议。
+
+运行环境要求:
+- 使用 python3,不要假设 python 命令存在;
+- 继续复用当前仓库和已有数据层;
+- 如需读取本地数据,请使用现有 data 目录。
+
+当你完全完成后,请执行:
+openclaw system event --text "Done: index-rotation phase2 signal/backtest implemented" --mode now

+ 29 - 0
index-rotation/configs/instruments.yaml

@@ -0,0 +1,29 @@
+instruments:
+  sse50:
+    name: 上证50
+    index_code: "000016"
+    provider_symbol: sh000016
+    exchange: SSE
+    price_type: price_index
+    bootstrap_start: "2003-12-31"
+  hs300:
+    name: 沪深300
+    index_code: "000300"
+    provider_symbol: sh000300
+    exchange: CSI
+    price_type: price_index
+    bootstrap_start: "2004-12-31"
+  chinext50:
+    name: 创业板50
+    index_code: "399673"
+    provider_symbol: sz399673
+    exchange: SZSE
+    price_type: price_index
+    bootstrap_start: "2010-05-31"
+  star50:
+    name: 科创50
+    index_code: "000688"
+    provider_symbol: sh000688
+    exchange: SSE
+    price_type: price_index
+    bootstrap_start: "2019-12-31"

+ 7 - 0
index-rotation/configs/strategy/top2_every_5_days.yaml

@@ -0,0 +1,7 @@
+name: top2_every_5_days
+top_n: 2
+rebalance_frequency: every_5_days
+commission_bps: 0.0
+slippage_bps: 0.0
+cash_return: 0.0
+start_date: "2019-12-31"

+ 7 - 0
index-rotation/configs/strategy/top2_weekly.yaml

@@ -0,0 +1,7 @@
+name: top2_weekly
+top_n: 2
+rebalance_frequency: weekly
+commission_bps: 0.0
+slippage_bps: 0.0
+cash_return: 0.0
+start_date: "2019-12-31"

+ 1 - 0
index-rotation/data/clean/.gitkeep

@@ -0,0 +1 @@
+

BIN
index-rotation/data/clean/chinext50/daily.parquet


BIN
index-rotation/data/clean/hs300/daily.parquet


BIN
index-rotation/data/clean/sse50/daily.parquet


BIN
index-rotation/data/clean/star50/daily.parquet


+ 1 - 0
index-rotation/data/features/.gitkeep

@@ -0,0 +1 @@
+

BIN
index-rotation/data/features/chinext50/daily.parquet


BIN
index-rotation/data/features/hs300/daily.parquet


BIN
index-rotation/data/features/sse50/daily.parquet


BIN
index-rotation/data/features/star50/daily.parquet


+ 12 - 0
index-rotation/data/meta/fetch_log.jsonl

@@ -0,0 +1,12 @@
+{"timestamp": "2026-04-06T11:46:47+00:00", "instrument": "sse50", "operation": "bootstrap", "layer": "raw", "requested_start": "2003-12-31", "requested_end": "2026-04-06", "rows_after_merge": 5403, "status": "success", "provider": "akshare_eastmoney", "trigger_layer": "raw", "fetched_rows": 5403}
+{"timestamp": "2026-04-06T11:46:47+00:00", "instrument": "sse50", "operation": "bootstrap", "layer": "clean", "requested_start": "2003-12-31", "requested_end": "2026-04-06", "rows_after_merge": 5403, "status": "success", "provider": "akshare_eastmoney", "trigger_layer": "raw"}
+{"timestamp": "2026-04-06T11:46:47+00:00", "instrument": "sse50", "operation": "bootstrap", "layer": "features", "requested_start": "2003-12-31", "requested_end": "2026-04-06", "rows_after_merge": 5403, "status": "success", "provider": "akshare_eastmoney", "trigger_layer": "raw"}
+{"timestamp": "2026-04-06T11:46:49+00:00", "instrument": "hs300", "operation": "bootstrap", "layer": "raw", "requested_start": "2004-12-31", "requested_end": "2026-04-06", "rows_after_merge": 5160, "status": "success", "provider": "akshare_eastmoney", "trigger_layer": "raw", "fetched_rows": 5160}
+{"timestamp": "2026-04-06T11:46:49+00:00", "instrument": "hs300", "operation": "bootstrap", "layer": "clean", "requested_start": "2004-12-31", "requested_end": "2026-04-06", "rows_after_merge": 5160, "status": "success", "provider": "akshare_eastmoney", "trigger_layer": "raw"}
+{"timestamp": "2026-04-06T11:46:49+00:00", "instrument": "hs300", "operation": "bootstrap", "layer": "features", "requested_start": "2004-12-31", "requested_end": "2026-04-06", "rows_after_merge": 5160, "status": "success", "provider": "akshare_eastmoney", "trigger_layer": "raw"}
+{"timestamp": "2026-04-06T11:46:50+00:00", "instrument": "chinext50", "operation": "bootstrap", "layer": "raw", "requested_start": "2010-05-31", "requested_end": "2026-04-06", "rows_after_merge": 2868, "status": "success", "provider": "akshare_eastmoney", "trigger_layer": "raw", "fetched_rows": 2868}
+{"timestamp": "2026-04-06T11:46:50+00:00", "instrument": "chinext50", "operation": "bootstrap", "layer": "clean", "requested_start": "2010-05-31", "requested_end": "2026-04-06", "rows_after_merge": 2868, "status": "success", "provider": "akshare_eastmoney", "trigger_layer": "raw"}
+{"timestamp": "2026-04-06T11:46:50+00:00", "instrument": "chinext50", "operation": "bootstrap", "layer": "features", "requested_start": "2010-05-31", "requested_end": "2026-04-06", "rows_after_merge": 2868, "status": "success", "provider": "akshare_eastmoney", "trigger_layer": "raw"}
+{"timestamp": "2026-04-06T11:46:51+00:00", "instrument": "star50", "operation": "bootstrap", "layer": "raw", "requested_start": "2019-12-31", "requested_end": "2026-04-06", "rows_after_merge": 1515, "status": "success", "provider": "akshare_eastmoney", "trigger_layer": "raw", "fetched_rows": 1515}
+{"timestamp": "2026-04-06T11:46:51+00:00", "instrument": "star50", "operation": "bootstrap", "layer": "clean", "requested_start": "2019-12-31", "requested_end": "2026-04-06", "rows_after_merge": 1515, "status": "success", "provider": "akshare_eastmoney", "trigger_layer": "raw"}
+{"timestamp": "2026-04-06T11:46:51+00:00", "instrument": "star50", "operation": "bootstrap", "layer": "features", "requested_start": "2019-12-31", "requested_end": "2026-04-06", "rows_after_merge": 1515, "status": "success", "provider": "akshare_eastmoney", "trigger_layer": "raw"}

+ 158 - 0
index-rotation/data/meta/manifest.json

@@ -0,0 +1,158 @@
+{
+  "generated_at": "2026-04-06T11:46:51+00:00",
+  "provider": "akshare_eastmoney",
+  "instruments": {
+    "sse50": {
+      "name": "上证50",
+      "index_code": "000016",
+      "provider_symbol": "sh000016",
+      "price_type": "price_index",
+      "configured_start": "2003-12-31",
+      "requested_start": "2003-12-31",
+      "actual_start": "2004-01-02",
+      "last_operation": "bootstrap",
+      "last_fetch_at": "2026-04-06T11:46:47+00:00",
+      "layers": {
+        "raw": {
+          "path": "data/raw/sse50/price.parquet",
+          "rows": 5403,
+          "updated_at": "2026-04-06T11:46:47+00:00",
+          "start_date": "2004-01-02",
+          "end_date": "2026-04-03",
+          "file_size_bytes": 274695
+        },
+        "clean": {
+          "path": "data/clean/sse50/daily.parquet",
+          "rows": 5403,
+          "updated_at": "2026-04-06T11:46:47+00:00",
+          "start_date": "2004-01-02",
+          "end_date": "2026-04-03",
+          "file_size_bytes": 396092
+        },
+        "features": {
+          "path": "data/features/sse50/daily.parquet",
+          "rows": 5403,
+          "updated_at": "2026-04-06T11:46:47+00:00",
+          "start_date": "2004-01-02",
+          "end_date": "2026-04-03",
+          "file_size_bytes": 777113
+        }
+      }
+    },
+    "hs300": {
+      "name": "沪深300",
+      "index_code": "000300",
+      "provider_symbol": "sh000300",
+      "price_type": "price_index",
+      "configured_start": "2004-12-31",
+      "requested_start": "2004-12-31",
+      "actual_start": "2005-01-04",
+      "last_operation": "bootstrap",
+      "last_fetch_at": "2026-04-06T11:46:49+00:00",
+      "layers": {
+        "raw": {
+          "path": "data/raw/hs300/price.parquet",
+          "rows": 5160,
+          "updated_at": "2026-04-06T11:46:48+00:00",
+          "start_date": "2005-01-04",
+          "end_date": "2026-04-03",
+          "file_size_bytes": 271235
+        },
+        "clean": {
+          "path": "data/clean/hs300/daily.parquet",
+          "rows": 5160,
+          "updated_at": "2026-04-06T11:46:48+00:00",
+          "start_date": "2005-01-04",
+          "end_date": "2026-04-03",
+          "file_size_bytes": 389792
+        },
+        "features": {
+          "path": "data/features/hs300/daily.parquet",
+          "rows": 5160,
+          "updated_at": "2026-04-06T11:46:49+00:00",
+          "start_date": "2005-01-04",
+          "end_date": "2026-04-03",
+          "file_size_bytes": 744689
+        }
+      }
+    },
+    "chinext50": {
+      "name": "创业板50",
+      "index_code": "399673",
+      "provider_symbol": "sz399673",
+      "price_type": "price_index",
+      "configured_start": "2010-05-31",
+      "requested_start": "2010-05-31",
+      "actual_start": "2014-06-18",
+      "last_operation": "bootstrap",
+      "last_fetch_at": "2026-04-06T11:46:50+00:00",
+      "layers": {
+        "raw": {
+          "path": "data/raw/chinext50/price.parquet",
+          "rows": 2868,
+          "updated_at": "2026-04-06T11:46:50+00:00",
+          "start_date": "2014-06-18",
+          "end_date": "2026-04-03",
+          "file_size_bytes": 147073
+        },
+        "clean": {
+          "path": "data/clean/chinext50/daily.parquet",
+          "rows": 2868,
+          "updated_at": "2026-04-06T11:46:50+00:00",
+          "start_date": "2014-06-18",
+          "end_date": "2026-04-03",
+          "file_size_bytes": 212865
+        },
+        "features": {
+          "path": "data/features/chinext50/daily.parquet",
+          "rows": 2868,
+          "updated_at": "2026-04-06T11:46:50+00:00",
+          "start_date": "2014-06-18",
+          "end_date": "2026-04-03",
+          "file_size_bytes": 412814
+        }
+      }
+    },
+    "star50": {
+      "name": "科创50",
+      "index_code": "000688",
+      "provider_symbol": "sh000688",
+      "price_type": "price_index",
+      "configured_start": "2019-12-31",
+      "requested_start": "2019-12-31",
+      "actual_start": "2019-12-31",
+      "last_operation": "bootstrap",
+      "last_fetch_at": "2026-04-06T11:46:51+00:00",
+      "layers": {
+        "raw": {
+          "path": "data/raw/star50/price.parquet",
+          "rows": 1515,
+          "updated_at": "2026-04-06T11:46:51+00:00",
+          "start_date": "2019-12-31",
+          "end_date": "2026-04-03",
+          "file_size_bytes": 78055
+        },
+        "clean": {
+          "path": "data/clean/star50/daily.parquet",
+          "rows": 1515,
+          "updated_at": "2026-04-06T11:46:51+00:00",
+          "start_date": "2019-12-31",
+          "end_date": "2026-04-03",
+          "file_size_bytes": 113052
+        },
+        "features": {
+          "path": "data/features/star50/daily.parquet",
+          "rows": 1515,
+          "updated_at": "2026-04-06T11:46:51+00:00",
+          "start_date": "2019-12-31",
+          "end_date": "2026-04-03",
+          "file_size_bytes": 217544
+        }
+      }
+    }
+  },
+  "common_sample": {
+    "start_date": "2019-12-31",
+    "end_date": "2026-04-03"
+  }
+}

+ 1 - 0
index-rotation/data/raw/.gitkeep

@@ -0,0 +1 @@
+

BIN
index-rotation/data/raw/chinext50/price.parquet


BIN
index-rotation/data/raw/hs300/price.parquet


BIN
index-rotation/data/raw/sse50/price.parquet


BIN
index-rotation/data/raw/star50/price.parquet


+ 11 - 0
index-rotation/outputs/backtests/top2_every_5_days/summary.json

@@ -0,0 +1,11 @@
+{
+  "cumulative_return": 0.35112748563653895,
+  "annual_return": 0.05133128315733071,
+  "max_drawdown": -0.40054879649731157,
+  "annual_volatility": 0.21732260741906997,
+  "sharpe": 0.23619854265022228,
+  "calmar": 0.12815238394474926,
+  "turnover": 106.91513242295126,
+  "rebalance_count": 303,
+  "cash_days_ratio": 0.28052805280528054
+}

+ 11 - 0
index-rotation/outputs/backtests/top2_weekly/summary.json

@@ -0,0 +1,11 @@
+{
+  "cumulative_return": 0.14272700647425873,
+  "annual_return": 0.02244029968816963,
+  "max_drawdown": -0.4194736830144995,
+  "annual_volatility": 0.2204480881486043,
+  "sharpe": 0.10179403176788994,
+  "calmar": 0.05349632312307411,
+  "turnover": 110.96558288782578,
+  "rebalance_count": 319,
+  "cash_days_ratio": 0.25610561056105613
+}

+ 26 - 0
index-rotation/pyproject.toml

@@ -0,0 +1,26 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "index-rotation-data-layer"
+version = "0.1.0"
+description = "A-share index rotation v1 data layer"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "akshare>=1.18.49",
+    "pandas>=2.3.0",
+    "PyYAML>=5.4.1",
+    "pyarrow>=12.0.0",
+]
+
+[tool.setuptools]
+packages = [
+    "src",
+    "src.data",
+    "src.data.providers",
+    "src.signals",
+    "src.portfolio",
+    "src.backtest",
+]

+ 1 - 0
index-rotation/src/__init__.py

@@ -0,0 +1 @@
+"""Top-level package for the index rotation project."""

+ 3 - 0
index-rotation/src/backtest/__init__.py

@@ -0,0 +1,3 @@
+from src.backtest.engine import BacktestConfig, load_feature_panel, run_backtest
+
+__all__ = ["BacktestConfig", "load_feature_panel", "run_backtest"]

+ 170 - 0
index-rotation/src/backtest/engine.py

@@ -0,0 +1,170 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+import pandas as pd
+
+from src.data.config import load_instruments
+from src.portfolio.allocator import allocate_weights
+from src.portfolio.rebalance import build_rebalance_plan
+from src.signals.selector import build_signal_panel
+from src.backtest.execution import (
+    CASH_COLUMN,
+    calculate_trading_cost,
+    calculate_turnover,
+    target_weights_from_group,
+    weights_from_values,
+)
+from src.backtest.metrics import compute_performance_metrics
+
+# Columns requested from each instrument's features parquet file by
+# load_feature_panel (passed as ``columns=`` to ``pd.read_parquet``).
+FEATURE_COLUMNS = [
+    "instrument",
+    "trade_date",
+    "close",
+    "daily_return",
+    "ret_5d",
+    "ret_10d",
+    "ret_20d",
+    "ret_60d",
+    "ma_20",
+    "ma_60",
+    "vol_10d",
+    "vol_20d",
+]
+
+
+@dataclass(frozen=True)
+class BacktestConfig:
+    """Immutable settings consumed by ``run_backtest``."""
+
+    # Forwarded to build_signal_panel and allocate_weights as ``top_n``.
+    top_n: int
+    # Forwarded to build_rebalance_plan as ``frequency``.
+    rebalance_frequency: str
+    # Cost inputs in basis points; run_backtest passes both to
+    # calculate_trading_cost on every rebalance.
+    commission_bps: float = 0.0
+    slippage_bps: float = 0.0
+    # run_backtest multiplies the cash bucket by (1 + cash_return) once per
+    # trade date, i.e. the value is used as a per-period rate.
+    cash_return: float = 0.0
+    # Passed to compute_performance_metrics as the annualization factor.
+    annualization: int = 252
+
+
+def load_feature_panel(
+    *,
+    data_root: Path,
+    instrument_config_path: Path,
+    start_date: str | None = None,
+    end_date: str | None = None,
+) -> pd.DataFrame:
+    """Load the per-instrument feature files into one long panel.
+
+    Reads ``<data_root>/features/<instrument>/daily.parquet`` for every
+    instrument returned by ``load_instruments``, optionally clips the result
+    to ``[start_date, end_date]`` (both inclusive), and keeps only the trade
+    dates on which every configured instrument has a row.
+
+    Returns:
+        The panel sorted by ``trade_date`` then ``instrument`` with a fresh
+        integer index.
+    """
+    instruments = load_instruments(instrument_config_path)
+    features_dir = data_root / "features"
+    per_instrument = [
+        pd.read_parquet(features_dir / key / "daily.parquet", columns=FEATURE_COLUMNS)
+        for key in instruments
+    ]
+
+    panel = pd.concat(per_instrument, ignore_index=True)
+    panel["trade_date"] = pd.to_datetime(panel["trade_date"])
+    if start_date is not None:
+        on_or_after = panel["trade_date"] >= pd.Timestamp(start_date)
+        panel = panel.loc[on_or_after]
+    if end_date is not None:
+        on_or_before = panel["trade_date"] <= pd.Timestamp(end_date)
+        panel = panel.loc[on_or_before]
+
+    # A date is usable only when all configured instruments are present on it.
+    expected = len(instruments)
+    present_per_date = panel.groupby("trade_date")["instrument"].transform("nunique")
+    complete = panel.loc[present_per_date == expected].copy()
+    ordered = complete.sort_values(["trade_date", "instrument"])
+    return ordered.reset_index(drop=True)
+
+
+def run_backtest(features_panel: pd.DataFrame, config: BacktestConfig) -> dict[str, pd.DataFrame | dict[str, float]]:
+    """Simulate the rotation strategy over every trade date in ``features_panel``.
+
+    The portfolio starts with a value of 1.0 held entirely in cash. On each
+    trade date the day's returns are applied to the existing holdings first;
+    only afterwards, if the date is an execution date in the rebalance plan,
+    the post-return value (net of trading cost) is redistributed to the
+    target weights. Newly set weights therefore start earning returns on the
+    following trade date.
+
+    Returns:
+        A dict with keys ``summary`` (performance metrics), ``daily_nav``,
+        ``daily_holdings``, ``rebalances`` and ``signals`` (the allocated
+        signal panel).
+    """
+    trade_dates = pd.DatetimeIndex(pd.to_datetime(features_panel["trade_date"])).sort_values().unique()
+    instruments = sorted(features_panel["instrument"].unique().tolist())
+    # Wide date x instrument return matrix; a missing daily_return is treated as 0.
+    returns = (
+        features_panel.pivot(index="trade_date", columns="instrument", values="daily_return")
+        .reindex(trade_dates)
+        .fillna(0.0)
+    )
+    closes = features_panel.pivot(index="trade_date", columns="instrument", values="close").reindex(trade_dates)
+
+    signal_panel = build_signal_panel(features_panel, top_n=config.top_n)
+    allocated = allocate_weights(signal_panel, top_n=config.top_n)
+    rebalance_plan = build_rebalance_plan(allocated, frequency=config.rebalance_frequency)
+
+    # Value held per bucket (instruments plus cash); initial NAV of 1.0 is all cash.
+    current_values = pd.Series(0.0, index=[*instruments, CASH_COLUMN], dtype=float)
+    current_values[CASH_COLUMN] = 1.0
+    # NOTE(review): the set below is normalized with pd.to_datetime, but the row
+    # filter inside the loop compares the raw execution_date column. This assumes
+    # build_rebalance_plan already emits Timestamps - confirm.
+    rebalance_dates = set(pd.to_datetime(rebalance_plan["execution_date"]).tolist())
+
+    nav_records: list[dict[str, float | pd.Timestamp]] = []
+    holding_records: list[dict[str, float | pd.Timestamp | str]] = []
+    rebalance_records: list[dict[str, float | pd.Timestamp | str]] = []
+    previous_nav = 1.0
+
+    for trade_date in trade_dates:
+        # Step 1: mark existing holdings to market with this date's returns.
+        trade_returns = returns.loc[trade_date]
+        current_values.loc[instruments] = current_values.loc[instruments] * (1.0 + trade_returns)
+        current_values.loc[CASH_COLUMN] = current_values.loc[CASH_COLUMN] * (1.0 + config.cash_return)
+
+        nav_before_rebalance = float(current_values.sum())
+        weights_before_rebalance = weights_from_values(current_values)
+        trading_cost = 0.0
+        turnover = 0.0
+
+        if trade_date in rebalance_dates:
+            # Step 2: rebalance at post-return values, charging cost on turnover.
+            group = rebalance_plan.loc[rebalance_plan["execution_date"] == trade_date].copy()
+            target_weights = target_weights_from_group(group, instruments)
+            turnover = calculate_turnover(weights_before_rebalance, target_weights)
+            trading_cost = nav_before_rebalance * calculate_trading_cost(
+                turnover,
+                commission_bps=config.commission_bps,
+                slippage_bps=config.slippage_bps,
+            )
+            nav_after_cost = nav_before_rebalance - trading_cost
+            current_values = target_weights * nav_after_cost
+            end_weights = target_weights
+
+            # One log row per plan row; turnover and cost repeat on each row.
+            for row in group.itertuples():
+                rebalance_records.append(
+                    {
+                        "signal_date": row.signal_date,
+                        "execution_date": row.execution_date,
+                        "instrument": row.instrument,
+                        "target_weight": float(row.target_weight),
+                        "cash_weight": float(row.cash_weight),
+                        "eligible_count": int(row.eligible_count),
+                        "selected_count": int(row.selected_count),
+                        "selection_rank": None if pd.isna(row.selection_rank) else int(row.selection_rank),
+                        "final_score": None if pd.isna(row.final_score) else float(row.final_score),
+                        "turnover": float(turnover),
+                        "trading_cost": float(trading_cost),
+                    }
+                )
+        else:
+            end_weights = weights_before_rebalance
+
+        # The recorded return is measured after any rebalance, so it is net of trading cost.
+        nav_after_rebalance = float(current_values.sum())
+        portfolio_return = nav_after_rebalance / previous_nav - 1.0
+        nav_records.append(
+            {
+                "trade_date": trade_date,
+                "nav": nav_after_rebalance,
+                "portfolio_return": portfolio_return,
+                "cash_weight": float(end_weights[CASH_COLUMN]),
+            }
+        )
+        for instrument in instruments:
+            holding_records.append(
+                {
+                    "trade_date": trade_date,
+                    "instrument": instrument,
+                    "weight": float(end_weights[instrument]),
+                    "close": float(closes.loc[trade_date, instrument]),
+                }
+            )
+        previous_nav = nav_after_rebalance
+
+    daily_nav = pd.DataFrame(nav_records)
+    daily_holdings = pd.DataFrame(holding_records)
+    rebalances = pd.DataFrame(rebalance_records)
+    metrics = compute_performance_metrics(daily_nav, rebalances, annualization=config.annualization)
+    return {
+        "summary": metrics,
+        "daily_nav": daily_nav,
+        "daily_holdings": daily_holdings,
+        "rebalances": rebalances,
+        "signals": allocated,
+    }

+ 30 - 0
index-rotation/src/backtest/execution.py

@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+import pandas as pd
+
+# Index label of the cash bucket in the weight Series built by this module.
+CASH_COLUMN = "__cash__"
+
+
+def target_weights_from_group(group: pd.DataFrame, instruments: list[str]) -> pd.Series:
+    """Build the target weight vector for one execution date.
+
+    The result is indexed by ``instruments`` followed by the cash label and
+    starts at zero everywhere. Each row of ``group`` sets its instrument's
+    ``target_weight``; the cash weight is read from the first row. An empty
+    ``group`` returns the all-zero vector unchanged.
+    """
+    labels = [*instruments, CASH_COLUMN]
+    target = pd.Series(0.0, index=labels, dtype=float)
+    if group.empty:
+        return target
+
+    for record in group.itertuples():
+        target[record.instrument] = float(record.target_weight)
+    # cash_weight is read from the first row only.
+    target[CASH_COLUMN] = float(group["cash_weight"].iloc[0])
+    return target
+
+
+def weights_from_values(values: pd.Series) -> pd.Series:
+    """Convert bucket values into weights that sum to one.
+
+    When the values sum to exactly zero there is nothing to normalize, so a
+    zero-scaled copy (``values * 0.0``) is returned instead of dividing.
+    """
+    portfolio_value = float(values.sum())
+    return values / portfolio_value if portfolio_value != 0 else values * 0.0
+
+
+def calculate_turnover(current_weights: pd.Series, target_weights: pd.Series) -> float:
+    """Return one-way turnover: half the summed absolute weight change.
+
+    The two series are aligned on the union of their labels; a label missing
+    from either side counts as a weight of zero.
+    """
+    paired = pd.concat({"current": current_weights, "target": target_weights}, axis=1).fillna(0.0)
+    gap = (paired["current"] - paired["target"]).abs()
+    return 0.5 * float(gap.sum())
+
+
+def calculate_trading_cost(turnover: float, *, commission_bps: float, slippage_bps: float) -> float:
+    """Return the cost rate for ``turnover`` given per-side costs in basis points.
+
+    Commission and slippage are added together and converted from basis
+    points (1 bp = 1/10000) before being applied to the turnover.
+    """
+    total_bps = commission_bps + slippage_bps
+    return turnover * total_bps / 10000.0

+ 49 - 0
index-rotation/src/backtest/metrics.py

@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+import math
+
+import pandas as pd
+
+
+def compute_performance_metrics(
+    daily_nav: pd.DataFrame,
+    rebalances: pd.DataFrame,
+    *,
+    annualization: int = 252,
+) -> dict[str, float]:
+    """Summarise a backtest's NAV path and rebalance log.
+
+    Args:
+        daily_nav: One row per day with ``nav``, ``portfolio_return`` and
+            ``cash_weight`` columns. May be completely empty (no rows and no
+            columns), in which case every metric is reported as zero.
+        rebalances: Rebalance records. ``execution_date`` and ``turnover``
+            columns are used when both are present.
+        annualization: Number of periods per year used to annualise return
+            and volatility.
+
+    Returns:
+        A dict with ``cumulative_return``, ``annual_return``,
+        ``max_drawdown``, ``annual_volatility``, ``sharpe``, ``calmar``,
+        ``turnover``, ``rebalance_count`` and ``cash_days_ratio``.
+    """
+    if daily_nav.empty:
+        # A frame built from an empty record list has no columns, so looking
+        # up "nav" would raise KeyError before the empty-input branches below
+        # get a chance to run.
+        nav = pd.Series(dtype=float)
+        daily_returns = pd.Series(dtype=float)
+    else:
+        nav = daily_nav["nav"].astype(float)
+        daily_returns = daily_nav["portfolio_return"].astype(float)
+    cumulative_return = float(nav.iloc[-1] - 1.0) if not nav.empty else 0.0
+
+    if len(daily_returns.index) > 0:
+        annual_return = float((nav.iloc[-1] ** (annualization / len(daily_returns.index))) - 1.0)
+        # Population standard deviation (ddof=0) scaled by sqrt(periods per year).
+        annual_volatility = float(daily_returns.std(ddof=0) * math.sqrt(annualization))
+    else:
+        annual_return = 0.0
+        annual_volatility = 0.0
+
+    running_max = nav.cummax()
+    drawdown = nav / running_max - 1.0
+    max_drawdown = float(drawdown.min()) if not drawdown.empty else 0.0
+    # Ratios fall back to 0.0 instead of dividing by zero.
+    sharpe = annual_return / annual_volatility if annual_volatility > 0 else 0.0
+    calmar = annual_return / abs(max_drawdown) if max_drawdown < 0 else 0.0
+
+    if not rebalances.empty and {"execution_date", "turnover"}.issubset(rebalances.columns):
+        # Several rows can share one execution date; only the first turnover
+        # value per date is counted.
+        turnover = float(rebalances.groupby("execution_date")["turnover"].first().sum())
+        rebalance_count = int(rebalances["execution_date"].nunique())
+    else:
+        turnover = 0.0
+        rebalance_count = 0
+    # Share of days on which the cash weight exceeds 0.999999.
+    cash_days_ratio = float((daily_nav["cash_weight"] > 0.999999).mean()) if not daily_nav.empty else 0.0
+
+    return {
+        "cumulative_return": cumulative_return,
+        "annual_return": annual_return,
+        "max_drawdown": max_drawdown,
+        "annual_volatility": annual_volatility,
+        "sharpe": float(sharpe),
+        "calmar": float(calmar),
+        "turnover": turnover,
+        "rebalance_count": rebalance_count,
+        "cash_days_ratio": cash_days_ratio,
+    }

+ 114 - 0
index-rotation/src/backtest/run.py

@@ -0,0 +1,114 @@
+from __future__ import annotations
+
+import argparse
+import json
+from dataclasses import asdict
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+from src.backtest.engine import BacktestConfig, load_feature_panel, run_backtest
+
+
+def repo_root() -> Path:
+    """Return the third ancestor directory of this module's resolved path."""
+    module_path = Path(__file__).resolve()
+    return module_path.parents[2]
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser for the backtest runner.
+
+    ``--config`` is mandatory; the remaining path options default to ``None``.
+    """
+    cli = argparse.ArgumentParser(description="Run the phase2 index-rotation backtest.")
+    cli.add_argument("--config", type=Path, required=True, help="Path to a strategy config YAML file.")
+    optional_paths = (
+        ("--data-root", "Path to the local data directory."),
+        ("--instrument-config", "Path to configs/instruments.yaml"),
+        ("--output-dir", "Directory used to save summary/nav/holdings/rebalances outputs."),
+    )
+    for flag, help_text in optional_paths:
+        cli.add_argument(flag, type=Path, default=None, help=help_text)
+    return cli
+
+
+def load_strategy_config(path: Path) -> dict[str, Any]:
+    """Parse the YAML file at ``path``; an empty document yields ``{}``."""
+    with path.open("r", encoding="utf-8") as stream:
+        parsed = yaml.safe_load(stream)
+    return parsed or {}
+
+
+def build_backtest_config(payload: dict[str, Any]) -> tuple[str, BacktestConfig, str | None, str | None]:
+    """Translate a parsed strategy config into backtest inputs.
+
+    Returns ``(name, config, start_date, end_date)``. ``top_n`` and
+    ``rebalance_frequency`` are required keys; the cost and cash-return
+    settings default to ``0.0`` and the name to ``"index_rotation_backtest"``.
+    """
+    strategy_name = str(payload.get("name") or "index_rotation_backtest")
+    settings = BacktestConfig(
+        top_n=int(payload["top_n"]),
+        rebalance_frequency=str(payload["rebalance_frequency"]),
+        commission_bps=float(payload.get("commission_bps", 0.0)),
+        slippage_bps=float(payload.get("slippage_bps", 0.0)),
+        cash_return=float(payload.get("cash_return", 0.0)),
+    )
+    window_start = payload.get("start_date")
+    window_end = payload.get("end_date")
+    return strategy_name, settings, window_start, window_end
+
+
+def save_outputs(result: dict[str, Any], *, output_dir: Path) -> dict[str, str]:
+    """Write the backtest result into ``output_dir`` and return the file paths.
+
+    The summary is stored as indented UTF-8 JSON; the three frames are stored
+    as CSV files without their index. The directory is created when missing.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    summary_path = output_dir / "summary.json"
+    summary_text = json.dumps(result["summary"], ensure_ascii=False, indent=2)
+    summary_path.write_text(summary_text, encoding="utf-8")
+    written = {"summary": str(summary_path)}
+
+    csv_targets = (
+        ("daily_nav", "daily_nav.csv"),
+        ("daily_holdings", "daily_holdings.csv"),
+        ("rebalances", "rebalances.csv"),
+    )
+    # Resolve every destination first so no CSV is written before all paths exist as values.
+    destinations = [(key, output_dir / file_name) for key, file_name in csv_targets]
+    for key, destination in destinations:
+        result[key].to_csv(destination, index=False)
+    for key, destination in destinations:
+        written[key] = str(destination)
+    return written
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run one backtest from the command line and print a JSON report.
+
+    Paths not given on the command line default to ``data``,
+    ``configs/instruments.yaml`` and ``outputs/backtests`` under the
+    repository root. Outputs are saved in a sub-directory named after the
+    strategy. Returns ``0`` on completion.
+    """
+    args = build_parser().parse_args(argv)
+    root = repo_root()
+    strategy_payload = load_strategy_config(args.config)
+    name, config, start_date, end_date = build_backtest_config(strategy_payload)
+
+    panel = load_feature_panel(
+        data_root=args.data_root or root / "data",
+        instrument_config_path=args.instrument_config or root / "configs" / "instruments.yaml",
+        start_date=start_date,
+        end_date=end_date,
+    )
+    result = run_backtest(panel, config)
+
+    base_dir = args.output_dir or root / "outputs" / "backtests"
+    output_paths = save_outputs(result, output_dir=base_dir / name)
+
+    report = {
+        "config": asdict(config),
+        "summary": result["summary"],
+        "output_paths": output_paths,
+    }
+    print(json.dumps(report, ensure_ascii=False, indent=2, default=str))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

+ 1 - 0
index-rotation/src/data/__init__.py

@@ -0,0 +1 @@
+"""Data layer package for the index rotation project."""

+ 26 - 0
index-rotation/src/data/backfill.py

@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+import argparse
+
+from src.data.cli_common import build_base_parser, build_pipeline, emit_json, handle_cli_error, parse_iso_date
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Extend the shared base parser with the backfill-specific options."""
+    cli = build_base_parser("Backfill a single instrument from the requested start date.")
+    cli.add_argument("--instrument", help="Instrument key in configs/instruments.yaml", required=True)
+    cli.add_argument("--start", help="ISO date, e.g. 2019-12-31", type=parse_iso_date, required=True)
+    return cli
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Backfill one instrument and print the resulting layer summaries as JSON.
+
+    Returns ``0`` on success; any exception is passed to ``handle_cli_error``
+    and its return value becomes the exit status.
+    """
+    try:
+        args = build_parser().parse_args(argv)
+        summaries = build_pipeline(args).backfill(args.instrument, args.start)
+        emit_json(summaries)
+    except Exception as exc:
+        return handle_cli_error(exc)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

+ 28 - 0
index-rotation/src/data/bootstrap.py

@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+import argparse
+
+from src.data.cli_common import build_base_parser, build_pipeline, emit_json, handle_cli_error
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Extend the shared base parser with the ``--all`` switch."""
+    cli = build_base_parser("Bootstrap all configured instruments into raw/clean/features layers.")
+    cli.add_argument("--all", help="Bootstrap all configured instruments.", action="store_true")
+    return cli
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Bootstrap every configured instrument and print the summaries as JSON.
+
+    The ``--all`` switch must be present, otherwise the parser reports a usage
+    error. Returns ``0`` on success; any exception is passed to
+    ``handle_cli_error`` and its return value becomes the exit status.
+    """
+    try:
+        cli = build_parser()
+        args = cli.parse_args(argv)
+        if not args.all:
+            cli.error("--all is required")
+        summaries = build_pipeline(args).bootstrap_all()
+        emit_json(summaries)
+    except Exception as exc:
+        return handle_cli_error(exc)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

+ 46 - 0
index-rotation/src/data/cli_common.py

@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import date
+from pathlib import Path
+from typing import Any
+
+from src.data.exceptions import DataLayerError
+from src.data.pipeline import DataPipeline
+
+
+def repo_root() -> Path:
+    """Return the third ancestor directory of this module's resolved path."""
+    here = Path(__file__).resolve()
+    return here.parents[2]
+
+
+def build_base_parser(description: str) -> argparse.ArgumentParser:
+    """Create a parser carrying the options shared by the data-layer commands.
+
+    Both ``--config`` and ``--data-root`` are optional paths defaulting to
+    ``None``.
+    """
+    cli = argparse.ArgumentParser(description=description)
+    shared_options = (
+        ("--config", "Path to configs/instruments.yaml"),
+        ("--data-root", "Path to data directory"),
+    )
+    for flag, help_text in shared_options:
+        cli.add_argument(flag, type=Path, default=None, help=help_text)
+    return cli
+
+
+def build_pipeline(args: argparse.Namespace) -> DataPipeline:
+    """Construct a ``DataPipeline`` from the parsed shared CLI options."""
+    return DataPipeline(
+        repo_root=repo_root(),
+        data_root=args.data_root,
+        config_path=args.config,
+    )
+
+
+def parse_iso_date(value: str) -> date:
+    """Parse an ISO-format date string such as ``2019-12-31``.
+
+    Raises ``ValueError`` when ``value`` is not a valid ISO date.
+    """
+    parsed = date.fromisoformat(value)
+    return parsed
+
+
+def emit_json(payload: Any) -> None:
+    """Print ``payload`` to stdout as indented JSON.
+
+    Non-ASCII text is kept as-is, and values JSON cannot encode natively are
+    rendered through ``str``.
+    """
+    rendered = json.dumps(payload, ensure_ascii=False, indent=2, default=str)
+    print(rendered)
+
+
+def handle_cli_error(exc: Exception) -> int:
+    """Turn a data-layer failure into exit status ``1``.
+
+    A ``DataLayerError`` is reported on stderr as ``ERROR: <message>``; every
+    other exception is raised again unchanged.
+    """
+    if not isinstance(exc, DataLayerError):
+        raise exc
+    print(f"ERROR: {exc}", file=sys.stderr)
+    return 1

+ 34 - 0
index-rotation/src/data/config.py

@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from datetime import date
+from pathlib import Path
+
+import yaml
+
+from src.data.exceptions import UnknownInstrumentError
+from src.data.models import Instrument
+
+
+def load_instruments(config_path: Path) -> dict[str, Instrument]:
+    """Load the ``instruments`` mapping of a YAML config into ``Instrument`` objects.
+
+    The returned dict is keyed by the config keys and keeps their order. A
+    missing or empty ``instruments`` section yields an empty dict.
+    """
+    with config_path.open("r", encoding="utf-8") as stream:
+        document = yaml.safe_load(stream) or {}
+
+    configured = document.get("instruments") or {}
+    return {
+        key: Instrument(
+            key=key,
+            name=spec["name"],
+            index_code=str(spec["index_code"]),
+            provider_symbol=str(spec["provider_symbol"]),
+            exchange=str(spec["exchange"]),
+            price_type=str(spec["price_type"]),
+            bootstrap_start=date.fromisoformat(str(spec["bootstrap_start"])),
+        )
+        for key, spec in configured.items()
+    }
+
+
+def get_instrument(instruments: dict[str, Instrument], key: str) -> Instrument:
+    """Look up ``key`` in ``instruments``.
+
+    Raises ``UnknownInstrumentError`` (chained to the ``KeyError``) when the
+    key is not configured.
+    """
+    try:
+        found = instruments[key]
+    except KeyError as missing:
+        raise UnknownInstrumentError(f"Unknown instrument: {key}") from missing
+    return found

+ 18 - 0
index-rotation/src/data/exceptions.py

@@ -0,0 +1,18 @@
+class DataLayerError(Exception):
+    """Base exception for the data layer.
+
+    The other exception classes in this module derive from it.
+    """
+
+
+class DependencyError(DataLayerError):
+    """Raised when an optional runtime dependency is required but missing.
+
+    Subclass of ``DataLayerError``.
+    """
+
+
+class ProviderError(DataLayerError):
+    """Raised when a provider request fails or returns unusable data.
+
+    Subclass of ``DataLayerError``.
+    """
+
+
+class UnknownInstrumentError(DataLayerError):
+    """Raised when the requested instrument is not configured.
+
+    Subclass of ``DataLayerError``.
+    """
+
+
+class MissingLayerError(DataLayerError):
+    """Raised when a repair operation cannot find its source layer.
+
+    Subclass of ``DataLayerError``.
+    """

+ 99 - 0
index-rotation/src/data/metadata.py

@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+from src.data.models import Instrument
+
+
+def utc_now_iso() -> str:
+    """Return the current UTC time as an ISO-8601 string truncated to whole seconds."""
+    now = datetime.now(timezone.utc)
+    whole_seconds = now.replace(microsecond=0)
+    return whole_seconds.isoformat()
+
+
+class MetadataStore:
+    """File-backed store for the manifest and the append-only fetch log.
+
+    Both files live directly under ``meta_root``: ``manifest.json`` holds one
+    JSON document and ``fetch_log.jsonl`` holds one JSON object per line.
+    """
+
+    def __init__(self, meta_root: Path) -> None:
+        """Remember the metadata directory and derive both file paths."""
+        self.meta_root = meta_root
+        self.manifest_path = meta_root / "manifest.json"
+        self.fetch_log_path = meta_root / "fetch_log.jsonl"
+
+    def ensure_layout(self) -> None:
+        """Create the directory and any missing file (empty manifest / empty log)."""
+        self.meta_root.mkdir(parents=True, exist_ok=True)
+        manifest_missing = not self.manifest_path.exists()
+        if manifest_missing:
+            self.save_manifest(empty_manifest())
+        log_missing = not self.fetch_log_path.exists()
+        if log_missing:
+            self.fetch_log_path.touch()
+
+    def load_manifest(self) -> dict[str, Any]:
+        """Return the parsed manifest, creating the layout first when needed."""
+        self.ensure_layout()
+        with self.manifest_path.open("r", encoding="utf-8") as stream:
+            manifest = json.load(stream)
+        return manifest
+
+    def save_manifest(self, manifest: dict[str, Any]) -> None:
+        """Overwrite the manifest file with ``manifest`` as indented JSON."""
+        self.meta_root.mkdir(parents=True, exist_ok=True)
+        with self.manifest_path.open("w", encoding="utf-8") as stream:
+            json.dump(manifest, stream, ensure_ascii=False, indent=2)
+
+    def append_fetch_log(self, payload: dict[str, Any]) -> None:
+        """Append ``payload`` to the fetch log as a single JSON line."""
+        self.ensure_layout()
+        with self.fetch_log_path.open("a", encoding="utf-8") as stream:
+            line = json.dumps(payload, ensure_ascii=False)
+            stream.write(line + "\n")
+
+
+def empty_manifest() -> dict[str, Any]:
+    """Return a fresh manifest skeleton with no provider and no instruments."""
+    skeleton: dict[str, Any] = {"generated_at": None, "provider": None}
+    skeleton["instruments"] = {}
+    skeleton["common_sample"] = {"start_date": None, "end_date": None}
+    return skeleton
+
+
+def update_manifest(
+    manifest: dict[str, Any],
+    *,
+    instrument: Instrument,
+    provider_name: str,
+    operation: str,
+    requested_start: str,
+    actual_start: str | None,
+    layer_summaries: dict[str, dict[str, Any]],
+) -> dict[str, Any]:
+    """Record the outcome of one operation for ``instrument`` in ``manifest``.
+
+    The instrument's entry is created when missing and updated in place, the
+    top-level provider and timestamp are refreshed, and the common sample
+    window is recomputed. The same (mutated) ``manifest`` object is returned.
+    """
+    all_entries = manifest.setdefault("instruments", {})
+    record = all_entries.setdefault(instrument.key, {})
+    record.update(
+        name=instrument.name,
+        index_code=instrument.index_code,
+        provider_symbol=instrument.provider_symbol,
+        price_type=instrument.price_type,
+        configured_start=instrument.bootstrap_start.isoformat(),
+        requested_start=requested_start,
+        actual_start=actual_start,
+        last_operation=operation,
+        last_fetch_at=utc_now_iso(),
+        layers=layer_summaries,
+    )
+
+    manifest["provider"] = provider_name
+    manifest["generated_at"] = utc_now_iso()
+    manifest["common_sample"] = compute_common_sample(manifest)
+    return manifest
+
+
+def compute_common_sample(manifest: dict[str, Any]) -> dict[str, str | None]:
+    """Return the date window covered by every instrument in ``manifest``.
+
+    Each instrument contributes the start/end dates of its ``clean`` layer,
+    falling back to ``raw``. The window runs from the latest start to the
+    earliest end. If there are no instruments, or any instrument lacks a
+    usable layer or date, both bounds are ``None``.
+    """
+    window_starts: list[str] = []
+    window_ends: list[str] = []
+    for entry in manifest.get("instruments", {}).values():
+        layers = entry.get("layers", {})
+        chosen = layers.get("clean") or layers.get("raw")
+        if chosen:
+            first_day, last_day = chosen.get("start_date"), chosen.get("end_date")
+        else:
+            first_day, last_day = None, None
+        if not (first_day and last_day):
+            return {"start_date": None, "end_date": None}
+        window_starts.append(first_day)
+        window_ends.append(last_day)
+
+    if not window_starts:
+        return {"start_date": None, "end_date": None}
+    return {"start_date": max(window_starts), "end_date": min(window_ends)}

+ 18 - 0
index-rotation/src/data/models.py

@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import date
+from typing import Literal
+
+# The three storage layer names accepted wherever a layer is selected.
+LayerName = Literal["raw", "clean", "features"]
+
+
+@dataclass(frozen=True)
+class Instrument:
+    """Immutable description of one configured index instrument."""
+
+    # Identifier of the instrument.
+    key: str
+    # Display name of the instrument.
+    name: str
+    # Index code of the instrument.
+    index_code: str
+    # Symbol used for this instrument by the data provider.
+    provider_symbol: str
+    # Exchange of the instrument.
+    exchange: str
+    # Price type of the instrument.
+    price_type: str
+    # Start date associated with bootstrapping this instrument.
+    bootstrap_start: date

+ 313 - 0
index-rotation/src/data/pipeline.py

@@ -0,0 +1,313 @@
+from __future__ import annotations
+
+from datetime import date, timedelta
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+
+from src.data.config import get_instrument, load_instruments
+from src.data.exceptions import MissingLayerError, ProviderError
+from src.data.metadata import MetadataStore, update_manifest, utc_now_iso
+from src.data.models import Instrument, LayerName
+from src.data.providers import build_provider
+from src.data.providers.base import IndexPriceProvider
+from src.data.storage import DataLake, LocalParquetDataLake
+from src.data.transform import build_clean_frame, build_features_frame, build_raw_frame
+
+
+class DataPipeline:
+    """Orchestrates fetching index prices and materialising the local layers.
+
+    Raw data comes from a provider; the clean and features layers are derived
+    from raw locally. Every operation also updates the manifest and appends
+    one fetch-log record per written layer.
+    """
+
+    def __init__(
+        self,
+        repo_root: Path,
+        *,
+        config_path: Path | None = None,
+        data_root: Path | None = None,
+        provider: IndexPriceProvider | None = None,
+        datalake: DataLake | None = None,
+        metadata_store: MetadataStore | None = None,
+    ) -> None:
+        """Wire up the pipeline, filling in defaults for anything not supplied.
+
+        Defaults: ``configs/instruments.yaml`` and ``data`` under
+        ``repo_root``, the ``"akshare"`` provider, a ``LocalParquetDataLake``
+        on the data root and a ``MetadataStore`` on ``<data_root>/meta``.
+        The instrument config is loaded and both storage layouts are ensured
+        during construction.
+        """
+        self.repo_root = repo_root
+        self.config_path = config_path or repo_root / "configs" / "instruments.yaml"
+        self.data_root = data_root or repo_root / "data"
+        self.instruments = load_instruments(self.config_path)
+        self.provider = provider or build_provider("akshare")
+        self.datalake = datalake or LocalParquetDataLake(self.data_root)
+        self.metadata_store = metadata_store or MetadataStore(self.data_root / "meta")
+        self.datalake.ensure_layout()
+        self.metadata_store.ensure_layout()
+
+    def bootstrap_all(self, today: date | None = None) -> dict[str, Any]:
+        """Refresh every configured instrument from its ``bootstrap_start``.
+
+        ``today`` overrides the end of the requested range (defaults to
+        ``date.today()``). Returns the layer summaries keyed by instrument key.
+        """
+        self.datalake.validate_runtime()
+        reference_date = today or date.today()
+        results: dict[str, Any] = {}
+        for instrument in self.instruments.values():
+            results[instrument.key] = self._refresh_from_provider(
+                instrument=instrument,
+                request_start=instrument.bootstrap_start,
+                request_end=reference_date,
+                operation="bootstrap",
+            )
+        return results
+
+    def update_since_last(self, today: date | None = None) -> dict[str, Any]:
+        """Fetch only the dates after the last stored raw ``trade_date``.
+
+        Instruments without stored raw data are requested from their
+        ``bootstrap_start``. Returns the layer summaries keyed by instrument key.
+        """
+        self.datalake.validate_runtime()
+        reference_date = today or date.today()
+        results: dict[str, Any] = {}
+        for instrument in self.instruments.values():
+            existing_raw = self._read_existing("raw", instrument.key)
+            if existing_raw.empty:
+                request_start = instrument.bootstrap_start
+            else:
+                # Resume on the day after the newest stored trade date.
+                last_date = pd.to_datetime(existing_raw["trade_date"]).max().date()
+                request_start = last_date + timedelta(days=1)
+            results[instrument.key] = self._refresh_from_provider(
+                instrument=instrument,
+                request_start=request_start,
+                request_end=reference_date,
+                operation="update",
+            )
+        return results
+
+    def backfill(self, instrument_key: str, start_date: date, today: date | None = None) -> dict[str, Any]:
+        """Refresh a single instrument from ``start_date`` up to ``today``.
+
+        Looks the instrument up via ``get_instrument`` and returns its layer
+        summaries.
+        """
+        self.datalake.validate_runtime()
+        instrument = get_instrument(self.instruments, instrument_key)
+        reference_date = today or date.today()
+        return self._refresh_from_provider(
+            instrument=instrument,
+            request_start=start_date,
+            request_end=reference_date,
+            operation="backfill",
+        )
+
+    def repair(self, instrument_key: str, layer: LayerName) -> dict[str, Any]:
+        """Rebuild the derived layers of one instrument from its stored raw data.
+
+        Both ``clean`` and ``features`` are rewritten whichever of the two is
+        requested; ``layer`` only affects the recorded operation name and the
+        ``trigger_layer`` log field. No provider request is made.
+
+        Raises:
+            MissingLayerError: if ``layer`` is ``"raw"`` or no raw data is stored.
+        """
+        self.datalake.validate_runtime()
+        instrument = get_instrument(self.instruments, instrument_key)
+        if layer == "raw":
+            raise MissingLayerError("Raw layer cannot be repaired locally. Use bootstrap/update/backfill.")
+        raw_frame = self._read_existing("raw", instrument_key)
+        if raw_frame.empty:
+            raise MissingLayerError(f"Missing raw layer for {instrument_key}")
+
+        clean_frame = build_clean_frame(raw_frame, instrument)
+        clean_summary = self._write_layer("clean", instrument, clean_frame)
+        features_frame = build_features_frame(clean_frame)
+        feature_summary = self._write_layer("features", instrument, features_frame)
+        layer_summaries = {"clean": clean_summary, "features": feature_summary}
+
+        manifest = self.metadata_store.load_manifest()
+        updated_manifest = update_manifest(
+            manifest,
+            instrument=instrument,
+            provider_name=self.provider.name,
+            operation=f"repair_{layer}",
+            requested_start=instrument.bootstrap_start.isoformat(),
+            actual_start=self._frame_start_date(raw_frame),
+            layer_summaries={
+                # Raw was not rewritten, so its summary is computed from the
+                # frame that was just read.
+                "raw": self._existing_layer_summary(instrument, "raw", raw_frame),
+                **self._merge_existing_layer_summaries(instrument, layer_summaries),
+            },
+        )
+        self.metadata_store.save_manifest(updated_manifest)
+        self._append_layer_logs(
+            instrument=instrument,
+            operation="repair",
+            requested_start=clean_summary["start_date"] or instrument.bootstrap_start.isoformat(),
+            requested_end=feature_summary["end_date"] or clean_summary["end_date"],
+            fetched_rows=0,
+            layer_summaries=layer_summaries,
+            trigger_layer=layer,
+        )
+        return layer_summaries
+
+    def status_snapshot(self) -> dict[str, Any]:
+        """Return the current manifest as loaded from the metadata store."""
+        return self.metadata_store.load_manifest()
+
+    def _refresh_from_provider(
+        self,
+        *,
+        instrument: Instrument,
+        request_start: date,
+        request_end: date,
+        operation: str,
+    ) -> dict[str, Any]:
+        """Fetch a date range, merge it into raw and rebuild all layers.
+
+        When ``request_start`` is after ``request_end`` nothing is fetched:
+        the layers are rebuilt from the stored raw data and the metadata is
+        recorded with ``fetched_rows=0``.
+
+        Raises:
+            ProviderError: if the fetch yields no rows and no raw data is stored.
+        """
+        existing_raw = self._read_existing("raw", instrument.key)
+        if request_start > request_end:
+            layer_summaries = self._materialize_local_layers(instrument, existing_raw)
+            self._persist_metadata(
+                instrument=instrument,
+                operation=operation,
+                requested_start=request_start,
+                requested_end=request_end,
+                fetched_rows=0,
+                layer_summaries=layer_summaries,
+            )
+            return layer_summaries
+
+        fetched = self.provider.fetch_price_history(instrument, request_start, request_end)
+        fetched_raw = build_raw_frame(fetched, instrument, self.provider.name)
+        if fetched_raw.empty and existing_raw.empty:
+            raise ProviderError(
+                f"No data returned for {instrument.key} between "
+                f"{request_start.isoformat()} and {request_end.isoformat()}"
+            )
+
+        merged_raw = merge_frames(existing_raw, fetched_raw)
+        layer_summaries = self._materialize_local_layers(instrument, merged_raw)
+        self._persist_metadata(
+            instrument=instrument,
+            operation=operation,
+            requested_start=request_start,
+            requested_end=request_end,
+            fetched_rows=len(fetched_raw),
+            layer_summaries=layer_summaries,
+        )
+        return layer_summaries
+
+    def _materialize_local_layers(
+        self,
+        instrument: Instrument,
+        raw_frame: pd.DataFrame,
+    ) -> dict[str, dict[str, Any]]:
+        """Write raw, then derive and write clean and features, in that order.
+
+        Returns the three write summaries keyed by layer name.
+        """
+        raw_summary = self._write_layer("raw", instrument, raw_frame)
+        clean_frame = build_clean_frame(raw_frame, instrument)
+        clean_summary = self._write_layer("clean", instrument, clean_frame)
+        features_frame = build_features_frame(clean_frame)
+        feature_summary = self._write_layer("features", instrument, features_frame)
+        return {"raw": raw_summary, "clean": clean_summary, "features": feature_summary}
+
+    def _persist_metadata(
+        self,
+        *,
+        instrument: Instrument,
+        operation: str,
+        requested_start: date,
+        requested_end: date,
+        fetched_rows: int,
+        layer_summaries: dict[str, dict[str, Any]],
+    ) -> None:
+        """Save the updated manifest and append the per-layer fetch-log records.
+
+        ``layer_summaries`` must contain a ``"raw"`` entry; its ``start_date``
+        is stored as the instrument's actual start.
+        """
+        manifest = self.metadata_store.load_manifest()
+        actual_start = layer_summaries["raw"]["start_date"]
+        updated_manifest = update_manifest(
+            manifest,
+            instrument=instrument,
+            provider_name=self.provider.name,
+            operation=operation,
+            requested_start=requested_start.isoformat(),
+            actual_start=actual_start,
+            layer_summaries=layer_summaries,
+        )
+        self.metadata_store.save_manifest(updated_manifest)
+        self._append_layer_logs(
+            instrument=instrument,
+            operation=operation,
+            requested_start=requested_start.isoformat(),
+            requested_end=requested_end.isoformat(),
+            fetched_rows=fetched_rows,
+            layer_summaries=layer_summaries,
+            trigger_layer="raw",
+        )
+
+    def _write_layer(
+        self,
+        layer: LayerName,
+        instrument: Instrument,
+        frame: pd.DataFrame,
+    ) -> dict[str, Any]:
+        """Write ``frame`` to the given layer and return its summary."""
+        path = self.datalake.write_layer(layer, instrument.key, frame)
+        return summarize_frame(self.repo_root, path, frame)
+
+    def _read_existing(self, layer: LayerName, instrument_key: str) -> pd.DataFrame:
+        """Read a stored layer, or return an empty frame when it does not exist."""
+        if not self.datalake.exists(layer, instrument_key):
+            return pd.DataFrame()
+        return self.datalake.read_layer(layer, instrument_key)
+
+    def _existing_layer_summary(
+        self,
+        instrument: Instrument,
+        layer: LayerName,
+        frame: pd.DataFrame | None = None,
+    ) -> dict[str, Any]:
+        """Summarise an already stored layer without rewriting it.
+
+        ``frame`` may be passed to avoid reading the layer again.
+        """
+        resolved_frame = frame if frame is not None else self._read_existing(layer, instrument.key)
+        return summarize_frame(self.repo_root, self.datalake.layer_path(layer, instrument.key), resolved_frame)
+
+    def _merge_existing_layer_summaries(
+        self,
+        instrument: Instrument,
+        updated_layers: dict[str, dict[str, Any]],
+    ) -> dict[str, dict[str, Any]]:
+        """Return a copy of ``updated_layers`` completed with stored layers.
+
+        ``clean`` and ``features`` entries missing from ``updated_layers`` are
+        filled in from storage when the layer exists there.
+        """
+        summaries = updated_layers.copy()
+        for layer in ("clean", "features"):
+            if layer not in summaries and self.datalake.exists(layer, instrument.key):
+                summaries[layer] = self._existing_layer_summary(instrument, layer)
+        return summaries
+
+    @staticmethod
+    def _frame_start_date(frame: pd.DataFrame) -> str | None:
+        """Return the earliest ``trade_date`` as an ISO date, or ``None`` if empty."""
+        if frame.empty:
+            return None
+        return pd.to_datetime(frame["trade_date"]).min().date().isoformat()
+
+    def _append_layer_logs(
+        self,
+        *,
+        instrument: Instrument,
+        operation: str,
+        requested_start: str,
+        requested_end: str | None,
+        fetched_rows: int,
+        layer_summaries: dict[str, dict[str, Any]],
+        trigger_layer: str,
+    ) -> None:
+        """Append one fetch-log record for each layer in ``layer_summaries``.
+
+        Every record is written with status ``"success"``; only the ``raw``
+        record carries ``fetched_rows``.
+        """
+        for layer_name, summary in layer_summaries.items():
+            payload = {
+                "timestamp": utc_now_iso(),
+                "instrument": instrument.key,
+                "operation": operation,
+                "layer": layer_name,
+                "requested_start": requested_start,
+                "requested_end": requested_end,
+                "rows_after_merge": summary["rows"],
+                "status": "success",
+                "provider": self.provider.name,
+                "trigger_layer": trigger_layer,
+            }
+            if layer_name == "raw":
+                payload["fetched_rows"] = fetched_rows
+            self.metadata_store.append_fetch_log(payload)
+
+
+def merge_frames(existing: pd.DataFrame, incoming: pd.DataFrame) -> pd.DataFrame:
+    """Combine stored and newly fetched rows into one frame.
+
+    When either side is empty the other is returned with a fresh index.
+    Otherwise the rows are concatenated, ``trade_date`` is coerced to
+    datetimes, the frame is sorted by date and only the last row of each date
+    is kept.
+    """
+    for kept, other in ((incoming, existing), (existing, incoming)):
+        if other.empty:
+            return kept.reset_index(drop=True)
+
+    combined = pd.concat([existing, incoming], ignore_index=True)
+    combined["trade_date"] = pd.to_datetime(combined["trade_date"], errors="coerce")
+    ordered = combined.sort_values("trade_date")
+    deduplicated = ordered.drop_duplicates("trade_date", keep="last")
+    return deduplicated.reset_index(drop=True)
+
+
+def summarize_frame(repo_root: Path, path: Path, frame: pd.DataFrame) -> dict[str, Any]:
+    """Describe a stored layer file for the manifest.
+
+    The summary holds the manifest path, row count, an update timestamp and
+    the first/last valid ``trade_date`` (``None`` when unavailable).
+    ``file_size_bytes`` is added only when ``path`` exists on disk.
+    """
+    summary = {
+        "path": _path_for_manifest(repo_root, path),
+        "rows": int(len(frame.index)),
+        "updated_at": utc_now_iso(),
+        "start_date": None,
+        "end_date": None,
+    }
+
+    has_dates = not frame.empty and "trade_date" in frame.columns
+    if has_dates:
+        valid_dates = pd.to_datetime(frame["trade_date"], errors="coerce").dropna()
+        if not valid_dates.empty:
+            earliest = valid_dates.min()
+            latest = valid_dates.max()
+            summary["start_date"] = earliest.date().isoformat()
+            summary["end_date"] = latest.date().isoformat()
+
+    if path.exists():
+        summary["file_size_bytes"] = path.stat().st_size
+    return summary
+
+
+def _path_for_manifest(repo_root: Path, path: Path) -> str:
+    """Return ``path`` relative to ``repo_root`` when possible, else unchanged."""
+    try:
+        relative = path.relative_to(repo_root)
+    except ValueError:
+        return str(path)
+    return str(relative)

+ 10 - 0
index-rotation/src/data/providers/__init__.py

@@ -0,0 +1,10 @@
+from __future__ import annotations
+
+from src.data.providers.akshare import AksharePriceIndexProvider
+from src.data.providers.base import IndexPriceProvider
+
+
+def build_provider(name: str) -> IndexPriceProvider:
+    """Instantiate the provider registered under ``name``.
+
+    Only ``"akshare"`` is supported; any other name raises ``ValueError``.
+    """
+    if name != "akshare":
+        raise ValueError(f"Unsupported provider: {name}")
+    return AksharePriceIndexProvider()

+ 76 - 0
index-rotation/src/data/providers/akshare.py

@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+import time
+from datetime import date
+
+import akshare as ak
+import pandas as pd
+
+from src.data.exceptions import ProviderError
+from src.data.models import Instrument
+from src.data.providers.base import IndexPriceProvider
+
+
+class AksharePriceIndexProvider(IndexPriceProvider):
+    """Index price provider backed by ``ak.stock_zh_index_daily_em``.
+
+    Requests are spaced at least ``request_pause_seconds`` after the previous
+    successful request, and failed requests are retried with a linearly
+    growing sleep.
+    """
+
+    name = "akshare_eastmoney"
+
+    def __init__(
+        self,
+        request_pause_seconds: float = 1.0,
+        max_retries: int = 3,
+        retry_backoff_seconds: float = 2.0,
+    ) -> None:
+        """Store the throttling and retry settings.
+
+        Args:
+            request_pause_seconds: Minimum gap after the last successful request.
+            max_retries: Total number of attempts per fetch.
+            retry_backoff_seconds: Base sleep between attempts; attempt ``k``
+                is followed by a sleep of ``k`` times this value.
+        """
+        self.request_pause_seconds = request_pause_seconds
+        self.max_retries = max_retries
+        self.retry_backoff_seconds = retry_backoff_seconds
+        # Monotonic timestamp of the last successful request; None until then.
+        self._last_request_ts: float | None = None
+
+    def fetch_price_history(
+        self,
+        instrument: Instrument,
+        start_date: date,
+        end_date: date,
+    ) -> pd.DataFrame:
+        """Fetch daily prices for ``instrument`` between the two dates.
+
+        Returns a frame with columns ``trade_date``, ``open``, ``close``,
+        ``high``, ``low``, ``volume`` and ``amount``, sorted by date with one
+        row per date. An empty provider response yields an empty frame with
+        those columns.
+
+        Raises:
+            ProviderError: when every attempt failed. The last underlying
+                exception is attached as ``__cause__``.
+        """
+        start = start_date.strftime("%Y%m%d")
+        end = end_date.strftime("%Y%m%d")
+        last_error: Exception | None = None
+
+        for attempt in range(1, self.max_retries + 1):
+            self._rate_limit()
+            try:
+                frame = ak.stock_zh_index_daily_em(
+                    symbol=instrument.provider_symbol,
+                    start_date=start,
+                    end_date=end,
+                )
+                self._last_request_ts = time.monotonic()
+                if frame.empty:
+                    return pd.DataFrame(
+                        columns=["trade_date", "open", "close", "high", "low", "volume", "amount"]
+                    )
+
+                frame = frame.rename(columns={"date": "trade_date"})
+                frame["trade_date"] = pd.to_datetime(frame["trade_date"], errors="coerce")
+                frame = frame[
+                    ["trade_date", "open", "close", "high", "low", "volume", "amount"]
+                ].copy()
+                frame = frame.sort_values("trade_date").drop_duplicates("trade_date", keep="last")
+                return frame.reset_index(drop=True)
+            except Exception as exc:  # pragma: no cover - exercised with fake provider in tests
+                last_error = exc
+                if attempt == self.max_retries:
+                    break
+                time.sleep(self.retry_backoff_seconds * attempt)
+
+        # Chain the last failure so its traceback is not lost behind the
+        # summarising ProviderError.
+        raise ProviderError(
+            f"Provider {self.name} failed for {instrument.key} "
+            f"between {start_date.isoformat()} and {end_date.isoformat()}: {last_error}"
+        ) from last_error
+
+    def _rate_limit(self) -> None:
+        """Sleep until the configured pause since the last successful request has passed."""
+        if self._last_request_ts is None:
+            return
+        elapsed = time.monotonic() - self._last_request_ts
+        remaining = self.request_pause_seconds - elapsed
+        if remaining > 0:
+            time.sleep(remaining)

+ 21 - 0
index-rotation/src/data/providers/base.py

@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from datetime import date
+
+import pandas as pd
+
+from src.data.models import Instrument
+
+
+class IndexPriceProvider(ABC):
+    """Abstract interface for sources of index price history."""
+
+    # Provider identifier; concrete subclasses override it.
+    name = "base"
+
+    @abstractmethod
+    def fetch_price_history(
+        self,
+        instrument: Instrument,
+        start_date: date,
+        end_date: date,
+    ) -> pd.DataFrame:
+        """Fetch raw price history for an index.
+
+        Args:
+            instrument: The instrument to fetch.
+            start_date: Start of the requested date range.
+            end_date: End of the requested date range.
+
+        Returns:
+            The fetched price history as a DataFrame.
+        """

+ 26 - 0
index-rotation/src/data/repair.py

@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+import argparse
+
+from src.data.cli_common import build_base_parser, build_pipeline, emit_json, handle_cli_error
+
+
+def build_parser() -> argparse.ArgumentParser:
+    parser = build_base_parser("Repair a downstream layer from local data only.")
+    parser.add_argument("--instrument", required=True, help="Instrument key in configs/instruments.yaml")
+    parser.add_argument("--layer", required=True, choices=["clean", "features"], help="Layer to repair")
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    try:
+        args = build_parser().parse_args(argv)
+        pipeline = build_pipeline(args)
+        emit_json(pipeline.repair(args.instrument, args.layer))
+        return 0
+    except Exception as exc:
+        return handle_cli_error(exc)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

+ 55 - 0
index-rotation/src/data/status.py

@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+import argparse
+
+from src.data.cli_common import build_base_parser, build_pipeline, emit_json, handle_cli_error
+
+
+def build_parser() -> argparse.ArgumentParser:
+    parser = build_base_parser("Show current local data layer status.")
+    parser.add_argument("--json", action="store_true", help="Emit manifest as JSON.")
+    return parser
+
+
+def _render_text(manifest: dict) -> str:
+    lines = [
+        f"provider: {manifest.get('provider')}",
+        f"generated_at: {manifest.get('generated_at')}",
+        "common_sample: "
+        f"{manifest.get('common_sample', {}).get('start_date')} -> "
+        f"{manifest.get('common_sample', {}).get('end_date')}",
+        "",
+    ]
+    instruments = manifest.get("instruments", {})
+    for key in sorted(instruments):
+        entry = instruments[key]
+        clean = entry.get("layers", {}).get("clean", {})
+        features = entry.get("layers", {}).get("features", {})
+        lines.extend(
+            [
+                f"[{key}] {entry.get('name')}",
+                f"  actual_start: {entry.get('actual_start')}",
+                f"  clean_last: {clean.get('end_date')}",
+                f"  features_last: {features.get('end_date')}",
+                "",
+            ]
+        )
+    return "\n".join(lines).rstrip()
+
+
+def main(argv: list[str] | None = None) -> int:
+    try:
+        args = build_parser().parse_args(argv)
+        pipeline = build_pipeline(args)
+        manifest = pipeline.status_snapshot()
+        if args.json:
+            emit_json(manifest)
+        else:
+            print(_render_text(manifest))
+        return 0
+    except Exception as exc:
+        return handle_cli_error(exc)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

+ 106 - 0
index-rotation/src/data/storage.py

@@ -0,0 +1,106 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Protocol
+
+import pandas as pd
+
+from src.data.exceptions import DependencyError
+from src.data.models import LayerName
+
+# Message attached to the DependencyError raised in this module when
+# `pyarrow` cannot be imported.
+PARQUET_DEPENDENCY_MESSAGE = (
+    "Parquet support requires `pyarrow`. Install project dependencies before running CLI commands."
+)
+
+# File name stored under <root>/<layer>/<instrument_key>/ for each data layer.
+LAYER_FILENAMES = {
+    "raw": "price.parquet",
+    "clean": "daily.parquet",
+    "features": "daily.parquet",
+}
+
+
+class DataLake(Protocol):
+    """Structural interface for per-layer, per-instrument frame storage.
+
+    Both storage classes in this module (parquet-backed and in-memory)
+    provide these methods.
+    """
+
+    def ensure_layout(self) -> None:
+        """Create whatever directory layout the backend needs."""
+        ...
+
+    def validate_runtime(self) -> None:
+        """Check that the backend's runtime requirements are available."""
+        ...
+
+    def exists(self, layer: LayerName, instrument_key: str) -> bool:
+        """Return whether data is stored for ``instrument_key`` in ``layer``."""
+        ...
+
+    def read_layer(self, layer: LayerName, instrument_key: str) -> pd.DataFrame:
+        """Return the stored frame.
+
+        The implementations in this module return an empty DataFrame when
+        nothing is stored for the given layer and instrument.
+        """
+        ...
+
+    def write_layer(self, layer: LayerName, instrument_key: str, frame: pd.DataFrame) -> Path:
+        """Store ``frame`` for the layer/instrument and return its layer path."""
+        ...
+
+    def layer_path(self, layer: LayerName, instrument_key: str) -> Path:
+        """Return the path associated with the layer/instrument pair."""
+        ...
+
+
+class LocalParquetDataLake:
+    def __init__(self, root: Path) -> None:
+        self.root = root
+
+    def ensure_layout(self) -> None:
+        for layer in ("raw", "clean", "features", "meta"):
+            (self.root / layer).mkdir(parents=True, exist_ok=True)
+
+    def validate_runtime(self) -> None:
+        self._require_parquet_engine()
+
+    def exists(self, layer: LayerName, instrument_key: str) -> bool:
+        return self.layer_path(layer, instrument_key).exists()
+
+    def read_layer(self, layer: LayerName, instrument_key: str) -> pd.DataFrame:
+        self._require_parquet_engine()
+        path = self.layer_path(layer, instrument_key)
+        if not path.exists():
+            return pd.DataFrame()
+        return pd.read_parquet(path)
+
+    def write_layer(self, layer: LayerName, instrument_key: str, frame: pd.DataFrame) -> Path:
+        self._require_parquet_engine()
+        path = self.layer_path(layer, instrument_key)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        frame.to_parquet(path, index=False)
+        return path
+
+    def layer_path(self, layer: LayerName, instrument_key: str) -> Path:
+        return self.root / layer / instrument_key / LAYER_FILENAMES[layer]
+
+    @staticmethod
+    def _require_parquet_engine() -> None:
+        try:
+            import pyarrow  # noqa: F401
+        except ImportError as exc:
+            raise DependencyError(PARQUET_DEPENDENCY_MESSAGE) from exc
+
+
+class InMemoryDataLake:
+    def __init__(self, root: Path) -> None:
+        self.root = root
+        self.frames: dict[tuple[str, str], pd.DataFrame] = {}
+
+    def ensure_layout(self) -> None:
+        self.root.mkdir(parents=True, exist_ok=True)
+
+    def validate_runtime(self) -> None:
+        return None
+
+    def exists(self, layer: LayerName, instrument_key: str) -> bool:
+        return (layer, instrument_key) in self.frames
+
+    def read_layer(self, layer: LayerName, instrument_key: str) -> pd.DataFrame:
+        frame = self.frames.get((layer, instrument_key))
+        if frame is None:
+            return pd.DataFrame()
+        return frame.copy()
+
+    def write_layer(self, layer: LayerName, instrument_key: str, frame: pd.DataFrame) -> Path:
+        self.frames[(layer, instrument_key)] = frame.copy()
+        return self.layer_path(layer, instrument_key)
+
+    def layer_path(self, layer: LayerName, instrument_key: str) -> Path:
+        return self.root / layer / instrument_key / LAYER_FILENAMES[layer]

+ 104 - 0
index-rotation/src/data/transform.py

@@ -0,0 +1,104 @@
+from __future__ import annotations
+
+import pandas as pd
+
+from src.data.models import Instrument
+
+# Column order of the raw layer: instrument identity and provider tags
+# followed by the provider's price fields.
+RAW_COLUMNS = [
+    "instrument",
+    "instrument_name",
+    "index_code",
+    "provider",
+    "trade_date",
+    "open",
+    "high",
+    "low",
+    "close",
+    "volume",
+    "amount",
+]
+
+# Column order of the clean layer: the raw columns plus price_type and the
+# derived prev_close / change_amount / daily_return fields.
+CLEAN_COLUMNS = [
+    "instrument",
+    "instrument_name",
+    "index_code",
+    "provider",
+    "price_type",
+    "trade_date",
+    "open",
+    "high",
+    "low",
+    "close",
+    "prev_close",
+    "change_amount",
+    "daily_return",
+    "volume",
+    "amount",
+]
+
+# Column order of the features layer: close and daily_return carried over from
+# the clean layer plus trailing returns, moving averages, volatilities and the
+# distance of close from its 20-period moving average.
+FEATURE_COLUMNS = [
+    "instrument",
+    "trade_date",
+    "close",
+    "daily_return",
+    "ret_1d",
+    "ret_5d",
+    "ret_10d",
+    "ret_20d",
+    "ret_60d",
+    "ma_5",
+    "ma_10",
+    "ma_20",
+    "ma_60",
+    "vol_10d",
+    "vol_20d",
+    "vol_60d",
+    "distance_to_ma_20",
+]
+
+
+def build_raw_frame(frame: pd.DataFrame, instrument: Instrument, provider_name: str) -> pd.DataFrame:
+    if frame.empty:
+        return pd.DataFrame(columns=RAW_COLUMNS)
+    raw = frame.copy()
+    raw["trade_date"] = pd.to_datetime(raw["trade_date"], errors="coerce")
+    raw["instrument"] = instrument.key
+    raw["instrument_name"] = instrument.name
+    raw["index_code"] = instrument.index_code
+    raw["provider"] = provider_name
+    raw = raw[["instrument", "instrument_name", "index_code", "provider", "trade_date", "open", "high", "low", "close", "volume", "amount"]]
+    raw = raw.sort_values("trade_date").drop_duplicates("trade_date", keep="last")
+    return raw.reset_index(drop=True)
+
+
+def build_clean_frame(raw_frame: pd.DataFrame, instrument: Instrument) -> pd.DataFrame:
+    if raw_frame.empty:
+        return pd.DataFrame(columns=CLEAN_COLUMNS)
+    clean = raw_frame.copy()
+    clean["trade_date"] = pd.to_datetime(clean["trade_date"], errors="coerce")
+    for column in ("open", "high", "low", "close", "volume", "amount"):
+        clean[column] = pd.to_numeric(clean[column], errors="coerce")
+    clean = clean.sort_values("trade_date").drop_duplicates("trade_date", keep="last")
+    clean["price_type"] = instrument.price_type
+    clean["prev_close"] = clean["close"].shift(1)
+    clean["change_amount"] = clean["close"] - clean["prev_close"]
+    clean["daily_return"] = clean["close"].pct_change()
+    clean = clean[CLEAN_COLUMNS]
+    return clean.reset_index(drop=True)
+
+
+def build_features_frame(clean_frame: pd.DataFrame) -> pd.DataFrame:
+    if clean_frame.empty:
+        return pd.DataFrame(columns=FEATURE_COLUMNS)
+    features = clean_frame.sort_values("trade_date").copy()
+    close = features["close"]
+    daily_return = features["daily_return"]
+    features["ret_1d"] = daily_return
+    for window in (5, 10, 20, 60):
+        features[f"ret_{window}d"] = close.pct_change(window)
+        features[f"ma_{window}"] = close.rolling(window, min_periods=window).mean()
+    for window in (10, 20, 60):
+        features[f"vol_{window}d"] = daily_return.rolling(window, min_periods=window).std(ddof=0)
+    features["distance_to_ma_20"] = close / features["ma_20"] - 1.0
+    features = features[FEATURE_COLUMNS]
+    return features.reset_index(drop=True)

+ 28 - 0
index-rotation/src/data/update.py

@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+import argparse
+
+from src.data.cli_common import build_base_parser, build_pipeline, emit_json, handle_cli_error
+
+
+def build_parser() -> argparse.ArgumentParser:
+    parser = build_base_parser("Incrementally update all configured instruments since the last local date.")
+    parser.add_argument("--since-last", action="store_true", help="Incrementally update from last local date.")
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    try:
+        parser = build_parser()
+        args = parser.parse_args(argv)
+        if not args.since_last:
+            parser.error("--since-last is required")
+        pipeline = build_pipeline(args)
+        emit_json(pipeline.update_since_last())
+        return 0
+    except Exception as exc:
+        return handle_cli_error(exc)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

+ 4 - 0
index-rotation/src/portfolio/__init__.py

@@ -0,0 +1,4 @@
+from src.portfolio.allocator import allocate_weights
+from src.portfolio.rebalance import build_rebalance_plan, generate_signal_dates
+
+# Names re-exported as the public API of the portfolio package.
+__all__ = ["allocate_weights", "build_rebalance_plan", "generate_signal_dates"]

+ 18 - 0
index-rotation/src/portfolio/allocator.py

@@ -0,0 +1,18 @@
+from __future__ import annotations
+
+import pandas as pd
+
+
+def allocate_weights(signal_panel: pd.DataFrame, *, top_n: int) -> pd.DataFrame:
+    if top_n < 1:
+        raise ValueError("top_n must be >= 1")
+
+    allocated = signal_panel.copy()
+    allocated["selected_count"] = allocated["eligible_count"].clip(upper=top_n)
+    allocated["target_weight"] = 0.0
+    selected_mask = allocated["selection_rank"].notna() & (
+        allocated["selection_rank"] <= allocated["selected_count"]
+    )
+    allocated.loc[selected_mask, "target_weight"] = 1.0 / allocated.loc[selected_mask, "selected_count"]
+    allocated["cash_weight"] = (allocated["selected_count"] == 0).astype(float)
+    return allocated

+ 37 - 0
index-rotation/src/portfolio/rebalance.py

@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+import pandas as pd
+
+SUPPORTED_REBALANCE_FREQUENCIES = {"daily", "weekly", "every_5_days"}
+
+
+def generate_signal_dates(trade_dates: pd.Series | pd.Index, frequency: str) -> pd.DatetimeIndex:
+    if frequency not in SUPPORTED_REBALANCE_FREQUENCIES:
+        raise ValueError(f"Unsupported rebalance frequency: {frequency}")
+
+    index = pd.DatetimeIndex(pd.to_datetime(trade_dates)).sort_values().unique()
+    if len(index) < 2:
+        return pd.DatetimeIndex([])
+
+    if frequency == "daily":
+        signal_dates = index[:-1]
+    elif frequency == "weekly":
+        schedule = pd.DataFrame({"trade_date": index})
+        signal_dates = pd.DatetimeIndex(
+            schedule.groupby(schedule["trade_date"].dt.to_period("W-FRI"))["trade_date"].max().sort_values()
+        )
+    else:
+        signal_dates = index[::5]
+
+    return signal_dates[signal_dates < index[-1]]
+
+
+def build_rebalance_plan(signal_panel: pd.DataFrame, *, frequency: str) -> pd.DataFrame:
+    trade_dates = pd.DatetimeIndex(pd.to_datetime(signal_panel["trade_date"])).sort_values().unique()
+    signal_dates = generate_signal_dates(trade_dates, frequency)
+    next_trade_date = {trade_dates[idx]: trade_dates[idx + 1] for idx in range(len(trade_dates) - 1)}
+    plan = signal_panel.loc[signal_panel["trade_date"].isin(signal_dates)].copy()
+    plan["signal_date"] = plan["trade_date"]
+    plan["execution_date"] = plan["signal_date"].map(next_trade_date)
+    plan = plan.drop(columns=["trade_date"])
+    return plan.sort_values(["execution_date", "instrument"]).reset_index(drop=True)

+ 3 - 0
index-rotation/src/signals/__init__.py

@@ -0,0 +1,3 @@
+from src.signals.selector import build_signal_panel
+
+# Names re-exported as the public API of the signals package.
+__all__ = ["build_signal_panel"]

+ 24 - 0
index-rotation/src/signals/ranker.py

@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from collections.abc import Mapping, Sequence
+
+import pandas as pd
+
+
+def add_cross_sectional_ranks(
+    frame: pd.DataFrame,
+    *,
+    columns: Sequence[str],
+    ascending_by_column: Mapping[str, bool] | None = None,
+    group_column: str = "trade_date",
+) -> pd.DataFrame:
+    ranked = frame.copy()
+    directions = ascending_by_column or {}
+    for column in columns:
+        ascending = directions.get(column, True)
+        ranked[f"{column}_rank"] = ranked.groupby(group_column)[column].rank(
+            method="average",
+            pct=True,
+            ascending=ascending,
+        )
+    return ranked

+ 35 - 0
index-rotation/src/signals/scorer.py

@@ -0,0 +1,35 @@
+from __future__ import annotations
+
+import pandas as pd
+
+from src.signals.ranker import add_cross_sectional_ranks
+
+# Weight applied to the cross-sectional rank of each trailing-return column
+# when building score_mom.
+MOMENTUM_WEIGHTS = {
+    "ret_5d": 0.20,
+    "ret_10d": 0.25,
+    "ret_20d": 0.30,
+    "ret_60d": 0.25,
+}
+
+# Weight applied to the cross-sectional rank of each volatility column when
+# building score_risk_penalty.
+RISK_WEIGHTS = {
+    "vol_10d": 0.60,
+    "vol_20d": 0.40,
+}
+
+# Factor by which score_risk_penalty is scaled before being subtracted from
+# score_mom to obtain final_score.
+RISK_PENALTY_MULTIPLIER = 0.30
+
+
+def add_composite_scores(frame: pd.DataFrame) -> pd.DataFrame:
+    scored = add_cross_sectional_ranks(
+        frame,
+        columns=[*MOMENTUM_WEIGHTS.keys(), *RISK_WEIGHTS.keys()],
+        ascending_by_column={column: True for column in [*MOMENTUM_WEIGHTS.keys(), *RISK_WEIGHTS.keys()]},
+    )
+    scored["score_mom"] = 0.0
+    for column, weight in MOMENTUM_WEIGHTS.items():
+        scored["score_mom"] = scored["score_mom"] + scored[f"{column}_rank"] * weight
+    scored["score_risk_penalty"] = 0.0
+    for column, weight in RISK_WEIGHTS.items():
+        scored["score_risk_penalty"] = scored["score_risk_penalty"] + scored[f"{column}_rank"] * weight
+    scored["final_score"] = scored["score_mom"] - RISK_PENALTY_MULTIPLIER * scored["score_risk_penalty"]
+    return scored

+ 28 - 0
index-rotation/src/signals/selector.py

@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+import pandas as pd
+
+from src.signals.scorer import add_composite_scores
+from src.signals.trend import apply_trend_filter
+
+
+def build_signal_panel(features_frame: pd.DataFrame, *, top_n: int) -> pd.DataFrame:
+    if top_n < 1:
+        raise ValueError("top_n must be >= 1")
+
+    signals = add_composite_scores(apply_trend_filter(features_frame))
+    signals = signals.sort_values(["trade_date", "instrument"]).reset_index(drop=True)
+    signals["eligible_for_selection"] = signals["trend_pass"] & signals["final_score"].notna()
+    signals["eligible_count"] = signals.groupby("trade_date")["eligible_for_selection"].transform("sum").astype(int)
+
+    eligible = signals.loc[signals["eligible_for_selection"]].copy()
+    eligible = eligible.sort_values(["trade_date", "final_score", "instrument"], ascending=[True, False, True])
+    eligible["selection_rank"] = eligible.groupby("trade_date").cumcount() + 1
+
+    merged = signals.merge(
+        eligible[["trade_date", "instrument", "selection_rank"]],
+        on=["trade_date", "instrument"],
+        how="left",
+    )
+    merged["selected"] = merged["selection_rank"].le(top_n).fillna(False)
+    return merged

+ 21 - 0
index-rotation/src/signals/trend.py

@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+import pandas as pd
+
+TREND_RULE_COLUMNS = [
+    "rule_close_above_ma_20",
+    "rule_close_above_ma_60",
+    "rule_ma_20_above_ma_60",
+    "rule_ret_20d_positive",
+]
+
+
+def apply_trend_filter(frame: pd.DataFrame) -> pd.DataFrame:
+    trend = frame.copy()
+    trend["rule_close_above_ma_20"] = (trend["close"] > trend["ma_20"]).fillna(False)
+    trend["rule_close_above_ma_60"] = (trend["close"] > trend["ma_60"]).fillna(False)
+    trend["rule_ma_20_above_ma_60"] = (trend["ma_20"] > trend["ma_60"]).fillna(False)
+    trend["rule_ret_20d_positive"] = (trend["ret_20d"] > 0).fillna(False)
+    trend["trend_rule_count"] = trend[TREND_RULE_COLUMNS].sum(axis=1)
+    trend["trend_pass"] = trend["trend_rule_count"] >= 2
+    return trend

+ 54 - 0
index-rotation/tests/test_cli.py

@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import io
+import tempfile
+import unittest
+from contextlib import redirect_stdout
+from pathlib import Path
+from unittest.mock import patch
+
+from src.data import bootstrap, status
+
+
+class CliTests(unittest.TestCase):
+    """Tests for the command-line entry points in ``src.data``."""
+
+    def test_bootstrap_requires_all_flag(self) -> None:
+        """Calling bootstrap with no arguments must exit via SystemExit."""
+        with self.assertRaises(SystemExit):
+            bootstrap.main([])
+
+    def test_status_renders_manifest(self) -> None:
+        """The text status output contains the provider and instrument header."""
+        temp_dir = tempfile.TemporaryDirectory()
+        self.addCleanup(temp_dir.cleanup)
+        root = Path(temp_dir.name)
+        (root / "src" / "data").mkdir(parents=True, exist_ok=True)
+        # Minimal manifest with the keys the text renderer reads.
+        manifest = {
+            "provider": "fake",
+            "generated_at": "2026-01-01T00:00:00+00:00",
+            "common_sample": {"start_date": "2020-01-01", "end_date": "2020-01-31"},
+            "instruments": {
+                "sse50": {
+                    "name": "上证50",
+                    "actual_start": "2020-01-01",
+                    "layers": {
+                        "clean": {"end_date": "2020-01-31"},
+                        "features": {"end_date": "2020-01-31"},
+                    },
+                }
+            },
+        }
+
+        # Stand-in pipeline: only status_snapshot() is needed by status.main.
+        class PipelineStub:
+            def status_snapshot(self):
+                return manifest
+
+        buf = io.StringIO()
+        # build_pipeline is patched so no real pipeline is constructed.
+        with patch("src.data.status.build_pipeline", return_value=PipelineStub()):
+            with redirect_stdout(buf):
+                exit_code = status.main([])
+        output = buf.getvalue()
+        self.assertEqual(exit_code, 0)
+        self.assertIn("provider: fake", output)
+        self.assertIn("[sse50] 上证50", output)
+
+
+# Allow running this test module directly.
+if __name__ == "__main__":
+    unittest.main()

+ 79 - 0
index-rotation/tests/test_phase2_backtest.py

@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+import unittest
+
+import pandas as pd
+
+from src.backtest.engine import BacktestConfig, run_backtest
+from src.portfolio.rebalance import generate_signal_dates
+
+
+def make_backtest_input() -> pd.DataFrame:
+    dates = pd.date_range("2020-01-01", periods=4, freq="D")
+    rows: list[dict[str, object]] = []
+    daily_returns = {
+        "sse50": [0.0, 0.50, 0.10, 0.00],
+        "hs300": [0.0, 0.00, 0.00, 0.00],
+        "chinext50": [0.0, 0.00, 0.00, 0.00],
+        "star50": [0.0, 0.00, 0.00, 0.00],
+    }
+    momentum = {"sse50": 0.20, "hs300": 0.10, "chinext50": 0.05, "star50": 0.02}
+    for trade_date in dates:
+        for instrument in ["sse50", "hs300", "chinext50", "star50"]:
+            rows.append(
+                {
+                    "instrument": instrument,
+                    "trade_date": trade_date,
+                    "close": 100.0,
+                    "daily_return": daily_returns[instrument][(trade_date - dates[0]).days],
+                    "ret_5d": momentum[instrument],
+                    "ret_10d": momentum[instrument],
+                    "ret_20d": momentum[instrument],
+                    "ret_60d": momentum[instrument],
+                    "ma_20": 90.0,
+                    "ma_60": 80.0,
+                    "vol_10d": 0.01 if instrument == "sse50" else 0.02,
+                    "vol_20d": 0.01 if instrument == "sse50" else 0.02,
+                }
+            )
+    return pd.DataFrame(rows)
+
+
+class BacktestTests(unittest.TestCase):
+    """Tests for rebalance scheduling and the backtest engine outputs."""
+
+    def test_generate_signal_dates_supports_weekly_and_every_five_days(self) -> None:
+        """Weekly and every-5-days schedules pick the expected dates."""
+        # 12 business days starting Wednesday 2020-01-01 and ending 2020-01-16.
+        trade_dates = pd.date_range("2020-01-01", periods=12, freq="B")
+        weekly = generate_signal_dates(trade_dates, "weekly")
+        every_five = generate_signal_dates(trade_dates, "every_5_days")
+        # The last trade date (2020-01-16) is never a signal date.
+        self.assertEqual(weekly.tolist(), [pd.Timestamp("2020-01-03"), pd.Timestamp("2020-01-10")])
+        self.assertEqual(every_five.tolist(), [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-08"), pd.Timestamp("2020-01-15")])
+
+    def test_signal_date_and_execution_date_are_separated(self) -> None:
+        """A signal on one day is executed on the next trade date."""
+        result = run_backtest(
+            make_backtest_input(),
+            BacktestConfig(top_n=1, rebalance_frequency="daily"),
+        )
+        nav = result["daily_nav"].set_index("trade_date")
+        rebalances = result["rebalances"].drop_duplicates(subset=["execution_date"])
+
+        self.assertEqual(rebalances.iloc[0]["signal_date"], pd.Timestamp("2020-01-01"))
+        self.assertEqual(rebalances.iloc[0]["execution_date"], pd.Timestamp("2020-01-02"))
+        # In the fixture sse50 returns 0.50 on 2020-01-02 and 0.10 on
+        # 2020-01-03; NAV is expected to stay at 1.0 on the execution date and
+        # reflect only the 0.10 return on the following day.
+        self.assertAlmostEqual(nav.loc[pd.Timestamp("2020-01-01"), "nav"], 1.0)
+        self.assertAlmostEqual(nav.loc[pd.Timestamp("2020-01-02"), "nav"], 1.0)
+        self.assertAlmostEqual(nav.loc[pd.Timestamp("2020-01-03"), "nav"], 1.1)
+
+    def test_backtest_outputs_holdings_and_basic_metrics(self) -> None:
+        """The result exposes summary, NAV, holdings and rebalance tables."""
+        result = run_backtest(
+            make_backtest_input(),
+            BacktestConfig(top_n=2, rebalance_frequency="daily"),
+        )
+        self.assertIn("summary", result)
+        self.assertIn("daily_nav", result)
+        self.assertIn("daily_holdings", result)
+        self.assertIn("rebalances", result)
+        self.assertGreaterEqual(result["summary"]["rebalance_count"], 1)
+        # Four fixture dates give holdings on 4 days and 3 execution dates.
+        self.assertEqual(result["daily_holdings"]["trade_date"].nunique(), 4)
+        self.assertEqual(result["rebalances"]["execution_date"].nunique(), 3)
+
+
+# Allow running this test module directly.
+if __name__ == "__main__":
+    unittest.main()

+ 115 - 0
index-rotation/tests/test_phase2_signals.py

@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+import unittest
+
+import pandas as pd
+
+from src.portfolio.allocator import allocate_weights
+from src.signals.selector import build_signal_panel
+from src.signals.trend import apply_trend_filter
+
+
+def make_signal_input(trade_date: str = "2020-01-10") -> pd.DataFrame:
+    return pd.DataFrame(
+        [
+            {
+                "instrument": "sse50",
+                "trade_date": pd.Timestamp(trade_date),
+                "close": 110,
+                "daily_return": 0.01,
+                "ret_5d": 0.08,
+                "ret_10d": 0.10,
+                "ret_20d": 0.12,
+                "ret_60d": 0.20,
+                "ma_20": 100,
+                "ma_60": 95,
+                "vol_10d": 0.10,
+                "vol_20d": 0.11,
+            },
+            {
+                "instrument": "hs300",
+                "trade_date": pd.Timestamp(trade_date),
+                "close": 108,
+                "daily_return": 0.01,
+                "ret_5d": 0.05,
+                "ret_10d": 0.06,
+                "ret_20d": 0.08,
+                "ret_60d": 0.12,
+                "ma_20": 102,
+                "ma_60": 101,
+                "vol_10d": 0.12,
+                "vol_20d": 0.13,
+            },
+            {
+                "instrument": "chinext50",
+                "trade_date": pd.Timestamp(trade_date),
+                "close": 96,
+                "daily_return": -0.01,
+                "ret_5d": -0.02,
+                "ret_10d": -0.01,
+                "ret_20d": -0.03,
+                "ret_60d": 0.02,
+                "ma_20": 98,
+                "ma_60": 100,
+                "vol_10d": 0.18,
+                "vol_20d": 0.20,
+            },
+            {
+                "instrument": "star50",
+                "trade_date": pd.Timestamp(trade_date),
+                "close": 102,
+                "daily_return": 0.00,
+                "ret_5d": 0.02,
+                "ret_10d": 0.03,
+                "ret_20d": 0.01,
+                "ret_60d": 0.04,
+                "ma_20": 101,
+                "ma_60": 103,
+                "vol_10d": 0.08,
+                "vol_20d": 0.09,
+            },
+        ]
+    )
+
+
+class SignalLayerTests(unittest.TestCase):
+    """Tests for the trend filter, ranking/selection and weight allocation."""
+
+    def test_trend_filter_requires_at_least_two_rules(self) -> None:
+        """Instruments meeting at least two trend rules pass; chinext50 meets none."""
+        frame = apply_trend_filter(make_signal_input())
+        outcome = frame.set_index("instrument")["trend_pass"].to_dict()
+        self.assertEqual(outcome["sse50"], True)
+        self.assertEqual(outcome["hs300"], True)
+        self.assertEqual(outcome["star50"], True)
+        self.assertEqual(outcome["chinext50"], False)
+
+    def test_ranking_and_scoring_only_selects_trend_pass_members(self) -> None:
+        """Only trend-pass rows get a selection rank, ordered by final score."""
+        signals = build_signal_panel(make_signal_input(), top_n=2).set_index("instrument")
+        self.assertEqual(signals.loc["sse50", "selection_rank"], 1)
+        self.assertEqual(signals.loc["hs300", "selection_rank"], 2)
+        # chinext50 fails the trend filter, so it has no rank at all.
+        self.assertTrue(pd.isna(signals.loc["chinext50", "selection_rank"]))
+        self.assertGreater(signals.loc["sse50", "final_score"], signals.loc["hs300", "final_score"])
+
+    def test_top1_top2_and_empty_allocation(self) -> None:
+        """Weights split equally across the selected names; nothing eligible means cash."""
+        base_signals = build_signal_panel(make_signal_input(), top_n=2)
+
+        top2 = allocate_weights(base_signals, top_n=2)
+        top2_weights = top2.set_index("instrument")["target_weight"].to_dict()
+        self.assertEqual(top2_weights["sse50"], 0.5)
+        self.assertEqual(top2_weights["hs300"], 0.5)
+        self.assertEqual(top2["cash_weight"].iloc[0], 0.0)
+
+        top1 = allocate_weights(build_signal_panel(make_signal_input(), top_n=1), top_n=1)
+        top1_weights = top1.set_index("instrument")["target_weight"].to_dict()
+        self.assertEqual(top1_weights["sse50"], 1.0)
+        self.assertEqual(sum(top1_weights.values()), 1.0)
+
+        # Overwrite the fixture so every instrument fails all four trend rules:
+        # close below both averages, ma_20 below ma_60, negative ret_20d.
+        empty = make_signal_input()
+        empty[["close", "ret_20d"]] = [90, -0.10]
+        empty["ma_20"] = 100
+        empty["ma_60"] = 110
+        allocated = allocate_weights(build_signal_panel(empty, top_n=2), top_n=2)
+        self.assertTrue((allocated["target_weight"] == 0.0).all())
+        self.assertEqual(allocated["cash_weight"].iloc[0], 1.0)
+
+
+# Allow running this test module directly.
+if __name__ == "__main__":
+    unittest.main()

+ 199 - 0
index-rotation/tests/test_pipeline.py

@@ -0,0 +1,199 @@
+from __future__ import annotations
+
+import json
+import tempfile
+import unittest
+from datetime import date
+from pathlib import Path
+
+import pandas as pd
+
+from src.data.metadata import MetadataStore
+from src.data.pipeline import DataPipeline
+from src.data.providers.base import IndexPriceProvider
+from src.data.storage import InMemoryDataLake
+from src.data.transform import build_clean_frame, build_features_frame
+
+
+class FakeProvider(IndexPriceProvider):
+    name = "fake_provider"
+
+    def __init__(self, frames: dict[str, list[pd.DataFrame]]) -> None:
+        self.frames = frames
+        self.calls: list[tuple[str, date, date]] = []
+
+    def fetch_price_history(self, instrument, start_date, end_date) -> pd.DataFrame:
+        self.calls.append((instrument.key, start_date, end_date))
+        queue = self.frames.setdefault(instrument.key, [])
+        if not queue:
+            return pd.DataFrame(columns=["trade_date", "open", "close", "high", "low", "volume", "amount"])
+        return queue.pop(0).copy()
+
+
+def make_price_frame(start: str, closes: list[float]) -> pd.DataFrame:
+    dates = pd.date_range(start=start, periods=len(closes), freq="D")
+    frame = pd.DataFrame(
+        {
+            "trade_date": dates,
+            "open": closes,
+            "close": closes,
+            "high": [value + 1 for value in closes],
+            "low": [value - 1 for value in closes],
+            "volume": [1000 + idx for idx in range(len(closes))],
+            "amount": [100000 + idx for idx in range(len(closes))],
+        }
+    )
+    return frame
+
+
+class PipelineTests(unittest.TestCase):
+    def setUp(self) -> None:
+        self.temp_dir = tempfile.TemporaryDirectory()
+        self.addCleanup(self.temp_dir.cleanup)
+        self.root = Path(self.temp_dir.name)
+        (self.root / "configs").mkdir(parents=True, exist_ok=True)
+        (self.root / "data" / "meta").mkdir(parents=True, exist_ok=True)
+        self.config_path = self.root / "configs" / "instruments.yaml"
+        self.config_path.write_text(
+            "\n".join(
+                [
+                    "instruments:",
+                    "  sse50:",
+                    "    name: 上证50",
+                    "    index_code: \"000016\"",
+                    "    provider_symbol: sh000016",
+                    "    exchange: SSE",
+                    "    price_type: price_index",
+                    "    bootstrap_start: \"2003-12-31\"",
+                ]
+            ),
+            encoding="utf-8",
+        )
+
+    def test_features_do_not_change_when_future_rows_are_appended(self) -> None:
+        clean_a = pd.DataFrame(
+            {
+                "instrument": ["sse50"] * 25,
+                "instrument_name": ["上证50"] * 25,
+                "index_code": ["000016"] * 25,
+                "provider": ["fake"] * 25,
+                "price_type": ["price_index"] * 25,
+                "trade_date": pd.date_range("2020-01-01", periods=25, freq="D"),
+                "open": range(1, 26),
+                "high": range(2, 27),
+                "low": range(0, 25),
+                "close": range(1, 26),
+                "prev_close": [None] + list(range(1, 25)),
+                "change_amount": [None] + [1] * 24,
+                "daily_return": [None] + [1.0 / value for value in range(1, 25)],
+                "volume": [100] * 25,
+                "amount": [1000] * 25,
+            }
+        )
+        features_a = build_features_frame(clean_a)
+        features_b = build_features_frame(pd.concat([clean_a, clean_a.tail(1).assign(trade_date=pd.Timestamp("2020-01-26"), close=26, open=26, high=27, low=25, prev_close=25, change_amount=1, daily_return=0.04)], ignore_index=True))
+        pd.testing.assert_frame_equal(
+            features_a.iloc[:25].reset_index(drop=True),
+            features_b.iloc[:25].reset_index(drop=True),
+        )
+
+    def test_bootstrap_then_incremental_update_merges_raw_and_updates_manifest(self) -> None:
+        provider = FakeProvider(
+            {
+                "sse50": [
+                    make_price_frame("2020-01-01", [10, 11, 12]),
+                    make_price_frame("2020-01-04", [13, 14]),
+                ]
+            }
+        )
+        datalake = InMemoryDataLake(self.root / "memory")
+        metadata = MetadataStore(self.root / "data" / "meta")
+        pipeline = DataPipeline(
+            repo_root=self.root,
+            config_path=self.config_path,
+            data_root=self.root / "data",
+            provider=provider,
+            datalake=datalake,
+            metadata_store=metadata,
+        )
+
+        bootstrap = pipeline.bootstrap_all(today=date(2020, 1, 3))
+        self.assertEqual(bootstrap["sse50"]["raw"]["rows"], 3)
+
+        update = pipeline.update_since_last(today=date(2020, 1, 5))
+        self.assertEqual(update["sse50"]["raw"]["rows"], 5)
+
+        raw_frame = datalake.read_layer("raw", "sse50")
+        self.assertEqual(len(raw_frame.index), 5)
+        manifest = metadata.load_manifest()
+        self.assertEqual(manifest["instruments"]["sse50"]["actual_start"], "2020-01-01")
+        self.assertEqual(manifest["instruments"]["sse50"]["layers"]["features"]["end_date"], "2020-01-05")
+        fetch_log_lines = (self.root / "data" / "meta" / "fetch_log.jsonl").read_text(encoding="utf-8").strip().splitlines()
+        self.assertEqual(len(fetch_log_lines), 6)
+        latest_payload = json.loads(fetch_log_lines[-1])
+        self.assertEqual(latest_payload["layer"], "features")
+        self.assertEqual(latest_payload["operation"], "update")
+
+    def test_repair_features_uses_local_clean_only(self) -> None:
+        provider = FakeProvider({"sse50": [make_price_frame("2020-01-01", [10, 11, 12, 13, 14])]})
+        datalake = InMemoryDataLake(self.root / "memory")
+        metadata = MetadataStore(self.root / "data" / "meta")
+        pipeline = DataPipeline(
+            repo_root=self.root,
+            config_path=self.config_path,
+            data_root=self.root / "data",
+            provider=provider,
+            datalake=datalake,
+            metadata_store=metadata,
+        )
+        pipeline.bootstrap_all(today=date(2020, 1, 5))
+        datalake.write_layer("features", "sse50", pd.DataFrame({"broken": [1]}))
+
+        repaired = pipeline.repair("sse50", "features")
+        repaired_frame = datalake.read_layer("features", "sse50")
+        self.assertIn("ret_1d", repaired_frame.columns)
+        self.assertEqual(repaired["features"]["rows"], 5)
+        self.assertEqual(len(provider.calls), 1)
+
+    def test_repair_clean_rebuilds_features_as_downstream_dependency(self) -> None:
+        provider = FakeProvider({"sse50": [make_price_frame("2020-01-01", [10, 11, 12, 13, 14])]})
+        datalake = InMemoryDataLake(self.root / "memory")
+        metadata = MetadataStore(self.root / "data" / "meta")
+        pipeline = DataPipeline(
+            repo_root=self.root,
+            config_path=self.config_path,
+            data_root=self.root / "data",
+            provider=provider,
+            datalake=datalake,
+            metadata_store=metadata,
+        )
+        pipeline.bootstrap_all(today=date(2020, 1, 5))
+        datalake.write_layer("features", "sse50", pd.DataFrame({"broken": [1]}))
+
+        repaired = pipeline.repair("sse50", "clean")
+        repaired_frame = datalake.read_layer("features", "sse50")
+        self.assertIn("ma_5", repaired_frame.columns)
+        self.assertEqual(repaired["clean"]["rows"], 5)
+        self.assertEqual(repaired["features"]["rows"], 5)
+
+    def test_clean_layer_computes_prev_close_and_daily_return(self) -> None:
+        raw_frame = make_price_frame("2020-01-01", [10, 12, 18])
+        raw_frame["instrument"] = "sse50"
+        raw_frame["instrument_name"] = "上证50"
+        raw_frame["index_code"] = "000016"
+        raw_frame["provider"] = "fake"
+        raw_frame = raw_frame[
+            ["instrument", "instrument_name", "index_code", "provider", "trade_date", "open", "high", "low", "close", "volume", "amount"]
+        ]
+
+        class InstrumentStub:
+            price_type = "price_index"
+
+        clean = build_clean_frame(raw_frame, InstrumentStub())
+        self.assertTrue(pd.isna(clean.loc[0, "prev_close"]))
+        self.assertAlmostEqual(clean.loc[1, "daily_return"], 0.2)
+        self.assertAlmostEqual(clean.loc[2, "daily_return"], 0.5)
+
+
+# Run this module's tests when the file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()