3 月之前 · 4983dba2ed
--- a/market-regime-identifier/cyb50_market_classifier.py
+++ b/market-regime-identifier/cyb50_market_classifier.py
@@ -0,0 +1,349 @@
 
				+#!/usr/bin/env python3
			
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+创业板50市场状态分类器 - 真实数据版
			
 
				+基于规则定义标签，使用有监督学习（Random Forest）
			
 
				+
			
 
				+数据源：akshare 创业板50指数 (sz399673)
			
 
				+标签定义基于真实价格行为规则
			
 
				+"""
			
 
				+
			
 
				+import numpy as np
			
 
				+import pandas as pd
			
 
				+from sklearn.ensemble import RandomForestClassifier
			
 
				+from sklearn.model_selection import train_test_split, cross_val_score
			
 
				+from sklearn.metrics import classification_report, confusion_matrix
			
 
				+import baostock as bs
			
 
				+import warnings
			
 
				+warnings.filterwarnings('ignore')
			
 
				+
			
 
				+
			
 
				+def fetch_cyb50_data(start_date="2017-01-01", end_date="2025-12-31"):
			
 
				+    """获取创业板50真实历史数据"""
			
 
				+    print(f"获取创业板50数据 ({start_date} - {end_date})...")
			
 
				+    
			
 
				+    try:
			
 
				+        # 使用baostock
			
 
				+        lg = bs.login()
			
 
				+        if lg.error_code != '0':
			
 
				+            print(f"baostock登录失败: {lg.error_msg}")
			
 
				+            return None
			
 
				+        
			
 
				+        # 创业板50代码: sz.399673
			
 
				+        rs = bs.query_history_k_data_plus("sz.399673",
			
 
				+            "date,open,high,low,close,volume",
			
 
				+            start_date=start_date, end_date=end_date,
			
 
				+            frequency="d", adjustflag="3")
			
 
				+        
			
 
				+        data_list = []
			
 
				+        while (rs.error_code == '0') & rs.next():
			
 
				+            row = rs.get_row_data()
			
 
				+            if row[0]:
			
 
				+                data_list.append({
			
 
				+                    'date': row[0],
			
 
				+                    'open': float(row[1]) if row[1] else 0,
			
 
				+                    'high': float(row[2]) if row[2] else 0,
			
 
				+                    'low': float(row[3]) if row[3] else 0,
			
 
				+                    'close': float(row[4]) if row[4] else 0,
			
 
				+                    'volume': int(float(row[5])) if row[5] else 0
			
 
				+                })
			
 
				+        
			
 
				+        bs.logout()
			
 
				+        
			
 
				+        if not data_list:
			
 
				+            print("✗ 未获取到数据")
			
 
				+            return None
			
 
				+        
			
 
				+        df = pd.DataFrame(data_list)
			
 
				+        df['date'] = pd.to_datetime(df['date'])
			
 
				+        df = df.set_index('date').sort_index()
			
 
				+        df['return'] = df['close'].pct_change()
			
 
				+        
			
 
				+        print(f"✓ 获取成功: {len(df)}条数据")
			
 
				+        print(f"  日期范围: {df.index[0].date()} ~ {df.index[-1].date()}")
			
 
				+        print(f"  价格范围: {df['close'].min():.2f} ~ {df['close'].max():.2f}")
			
 
				+        
			
 
				+        return df[['open', 'high', 'low', 'close', 'volume', 'return']]
			
 
				+    
			
 
				+    except Exception as e:
			
 
				+        print(f"✗ 数据获取失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def calculate_features(df):
			
 
				+    """计算技术指标特征"""
			
 
				+    features = pd.DataFrame(index=df.index)
			
 
				+    
			
 
				+    # 价格特征
			
 
				+    features['close'] = df['close']
			
 
				+    
			
 
				+    # 1. 收益率特征
			
 
				+    features['ret_1d'] = df['return']
			
 
				+    features['ret_5d'] = df['close'].pct_change(5)
			
 
				+    features['ret_10d'] = df['close'].pct_change(10)
			
 
				+    features['ret_20d'] = df['close'].pct_change(20)
			
 
				+    
			
 
				+    # 2. 波动率特征
			
 
				+    features['volatility_5d'] = df['return'].rolling(5).std() * np.sqrt(252)
			
 
				+    features['volatility_20d'] = df['return'].rolling(20).std() * np.sqrt(252)
			
 
				+    features['volatility_ratio'] = features['volatility_5d'] / (features['volatility_20d'] + 1e-10)
			
 
				+    
			
 
				+    # 3. 动量特征
			
 
				+    features['momentum_10d'] = df['close'] / df['close'].shift(10) - 1
			
 
				+    features['momentum_20d'] = df['close'] / df['close'].shift(20) - 1
			
 
				+    
			
 
				+    # 4. 均线特征
			
 
				+    features['ma5'] = df['close'].rolling(5).mean()
			
 
				+    features['ma20'] = df['close'].rolling(20).mean()
			
 
				+    features['ma60'] = df['close'].rolling(60).mean()
			
 
				+    features['ma5_above_ma20'] = (features['ma5'] > features['ma20']).astype(int)
			
 
				+    features['price_above_ma20'] = (df['close'] > features['ma20']).astype(int)
			
 
				+    
			
 
				+    # 5. RSI
			
 
				+    delta = df['close'].diff()
			
 
				+    gain = (delta.where(delta > 0, 0)).rolling(14).mean()
			
 
				+    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
			
 
				+    rs = gain / (loss + 1e-10)
			
 
				+    features['rsi_14'] = 100 - (100 / (1 + rs))
			
 
				+    
			
 
				+    # 6. MACD
			
 
				+    ema12 = df['close'].ewm(span=12).mean()
			
 
				+    ema26 = df['close'].ewm(span=26).mean()
			
 
				+    features['macd'] = ema12 - ema26
			
 
				+    features['macd_signal'] = features['macd'].ewm(span=9).mean()
			
 
				+    features['macd_hist'] = features['macd'] - features['macd_signal']
			
 
				+    
			
 
				+    # 7. 布林带
			
 
				+    features['bb_middle'] = df['close'].rolling(20).mean()
			
 
				+    bb_std = df['close'].rolling(20).std()
			
 
				+    features['bb_upper'] = features['bb_middle'] + 2 * bb_std
			
 
				+    features['bb_lower'] = features['bb_middle'] - 2 * bb_std
			
 
				+    features['bb_position'] = (df['close'] - features['bb_lower']) / (features['bb_upper'] - features['bb_lower'] + 1e-10)
			
 
				+    
			
 
				+    # 8. ATR (平均真实波幅)
			
 
				+    high_low = df['high'] - df['low']
			
 
				+    high_close = np.abs(df['high'] - df['close'].shift())
			
 
				+    low_close = np.abs(df['low'] - df['close'].shift())
			
 
				+    tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
			
 
				+    features['atr_14'] = tr.rolling(14).mean()
			
 
				+    features['atr_ratio'] = features['atr_14'] / df['close']
			
 
				+    
			
 
				+    # 9. 成交量特征
			
 
				+    features['volume_ratio'] = df['volume'] / df['volume'].rolling(20).mean()
			
 
				+    
			
 
				+    # 10. 趋势强度
			
 
				+    features['adx'] = calculate_adx(df, 14)
			
 
				+    
			
 
				+    # 填充缺失值
			
 
				+    features = features.ffill().fillna(0)
			
 
				+    
			
 
				+    return features
			
 
				+
			
 
				+
			
 
				+def calculate_adx(df, period=14):
			
 
				+    """计算ADX趋势强度指标"""
			
 
				+    plus_dm = df['high'].diff()
			
 
				+    minus_dm = df['low'].diff().abs()
			
 
				+    
			
 
				+    plus_dm[plus_dm < 0] = 0
			
 
				+    minus_dm[minus_dm < 0] = 0
			
 
				+    
			
 
				+    tr = pd.concat([
			
 
				+        df['high'] - df['low'],
			
 
				+        (df['high'] - df['close'].shift()).abs(),
			
 
				+        (df['low'] - df['close'].shift()).abs()
			
 
				+    ], axis=1).max(axis=1)
			
 
				+    
			
 
				+    atr = tr.rolling(period).mean()
			
 
				+    
			
 
				+    plus_di = 100 * (plus_dm.rolling(period).mean() / atr)
			
 
				+    minus_di = 100 * (minus_dm.rolling(period).mean() / atr)
			
 
				+    
			
 
				+    dx = (abs(plus_di - minus_di) / (plus_di + minus_di + 1e-10)) * 100
			
 
				+    adx = dx.rolling(period).mean()
			
 
				+    
			
 
				+    return adx
			
 
				+
			
 
				+
			
 
				+def define_market_regime(df, lookback=10):
			
 
				+    """
			
 
				+    基于规则定义市场状态标签
			
 
				+    
			
 
				+    规则：
			
 
				+    - 趋势上涨 (1): N日收益 > 5%, 且期间最高点/最低点 > 1.03
			
 
				+    - 趋势下跌 (1): N日收益 < -5%, 且期间最低点/最高点 < 0.97
			
 
				+    - 震荡 (0): 波动率在10%-25%之间，|N日收益| < 3%
			
 
				+    - 反转 (2): 前N/2日有明确趋势，后N/2日反向运动超过50%
			
 
				+    """
			
 
				+    labels = []
			
 
				+    
			
 
				+    for i in range(len(df)):
			
 
				+        if i < lookback:
			
 
				+            labels.append(0)  # 前期标记为震荡
			
 
				+            continue
			
 
				+        
			
 
				+        # 获取回看期间数据
			
 
				+        period_close = df['close'].iloc[i-lookback:i]
			
 
				+        period_high = df['high'].iloc[i-lookback:i]
			
 
				+        period_low = df['low'].iloc[i-lookback:i]
			
 
				+        
			
 
				+        start_price = period_close.iloc[0]
			
 
				+        end_price = period_close.iloc[-1]
			
 
				+        period_return = (end_price / start_price - 1) * 100
			
 
				+        
			
 
				+        # 计算期间波动
			
 
				+        volatility = period_close.pct_change().std() * np.sqrt(252) * 100
			
 
				+        
			
 
				+        # 判断趋势强度
			
 
				+        max_price = period_high.max()
			
 
				+        min_price = period_low.min()
			
 
				+        
			
 
				+        # 前半段和后半段
			
 
				+        mid = lookback // 2
			
 
				+        first_half_return = (period_close.iloc[mid] / start_price - 1) * 100
			
 
				+        second_half_return = (end_price / period_close.iloc[mid] - 1) * 100
			
 
				+        
			
 
				+        # 定义标签
			
 
				+        label = 0  # 默认震荡
			
 
				+        
			
 
				+        # 趋势判断
			
 
				+        if abs(period_return) > 5 and volatility < 35:
			
 
				+            if period_return > 0 and max_price / min_price > 1.05:
			
 
				+                label = 1  # 趋势上涨
			
 
				+            elif period_return < 0 and max_price / min_price > 1.05:
			
 
				+                label = 1  # 趋势下跌
			
 
				+        
			
 
				+        # 震荡判断
			
 
				+        elif abs(period_return) < 3 and 10 < volatility < 30:
			
 
				+            label = 0  # 震荡
			
 
				+        
			
 
				+        # 反转判断：前期有趋势，后期反向
			
 
				+        elif abs(first_half_return) > 3 and abs(second_half_return) > 2:
			
 
				+            if np.sign(first_half_return) != np.sign(second_half_return):
			
 
				+                label = 2  # 反转
			
 
				+        
			
 
				+        labels.append(label)
			
 
				+    
			
 
				+    return np.array(labels)
			
 
				+
			
 
				+
			
 
				+def train_classifier(features, labels):
			
 
				+    """训练随机森林分类器"""
			
 
				+    print("\n训练分类器...")
			
 
				+    
			
 
				+    # 对齐数据
			
 
				+    valid_idx = ~np.isnan(labels)
			
 
				+    X = features[valid_idx]
			
 
				+    y = labels[valid_idx]
			
 
				+    
			
 
				+    # 分割训练集和测试集（按时间顺序）
			
 
				+    split_idx = int(len(X) * 0.7)
			
 
				+    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
			
 
				+    y_train, y_test = y[:split_idx], y[split_idx:]
			
 
				+    
			
 
				+    print(f"训练集: {len(X_train)}条")
			
 
				+    print(f"测试集: {len(X_test)}条")
			
 
				+    
			
 
				+    # 训练模型
			
 
				+    clf = RandomForestClassifier(
			
 
				+        n_estimators=100,
			
 
				+        max_depth=10,
			
 
				+        min_samples_split=20,
			
 
				+        min_samples_leaf=10,
			
 
				+        random_state=42,
			
 
				+        class_weight='balanced'
			
 
				+    )
			
 
				+    
			
 
				+    clf.fit(X_train, y_train)
			
 
				+    
			
 
				+    # 评估
			
 
				+    train_score = clf.score(X_train, y_train)
			
 
				+    test_score = clf.score(X_test, y_test)
			
 
				+    
			
 
				+    print(f"\n训练准确率: {train_score:.2%}")
			
 
				+    print(f"测试准确率: {test_score:.2%}")
			
 
				+    
			
 
				+    # 交叉验证
			
 
				+    cv_scores = cross_val_score(clf, X, y, cv=5)
			
 
				+    print(f"交叉验证准确率: {cv_scores.mean():.2%} (+/- {cv_scores.std()*2:.2%})")
			
 
				+    
			
 
				+    # 详细报告
			
 
				+    y_pred = clf.predict(X_test)
			
 
				+    print("\n分类报告:")
			
 
				+    print(classification_report(y_test, y_pred, target_names=['震荡', '趋势', '反转']))
			
 
				+    
			
 
				+    # 特征重要性
			
 
				+    feature_importance = pd.DataFrame({
			
 
				+        'feature': X.columns,
			
 
				+        'importance': clf.feature_importances_
			
 
				+    }).sort_values('importance', ascending=False)
			
 
				+    
			
 
				+    print("\n特征重要性 TOP 10:")
			
 
				+    print(feature_importance.head(10).to_string(index=False))
			
 
				+    
			
 
				+    return clf, feature_importance
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """主程序"""
			
 
				+    print("="*70)
			
 
				+    print("创业板50市场状态分类器 - 真实数据版")
			
 
				+    print("="*70)
			
 
				+    
			
 
				+    # 1. 获取真实数据
			
 
				+    df = fetch_cyb50_data("2017-01-01", "2025-12-31")
			
 
				+    if df is None:
			
 
				+        return
			
 
				+    
			
 
				+    # 2. 计算特征
			
 
				+    print("\n计算技术指标...")
			
 
				+    features = calculate_features(df)
			
 
				+    print(f"特征数量: {features.shape[1]}")
			
 
				+    
			
 
				+    # 3. 定义标签
			
 
				+    print("\n定义市场状态标签...")
			
 
				+    labels = define_market_regime(df, lookback=10)
			
 
				+    
			
 
				+    # 统计标签分布
			
 
				+    unique, counts = np.unique(labels, return_counts=True)
			
 
				+    print("\n标签分布:")
			
 
				+    state_names = ['震荡', '趋势', '反转']
			
 
				+    for u, c in zip(unique, counts):
			
 
				+        print(f"  {state_names[u]}: {c}天 ({c/len(labels)*100:.1f}%)")
			
 
				+    
			
 
				+    # 4. 训练分类器
			
 
				+    clf, importance = train_classifier(features, labels)
			
 
				+    
			
 
				+    # 5. 当前状态预测
			
 
				+    print("\n" + "="*70)
			
 
				+    print("当前市场状态识别")
			
 
				+    print("="*70)
			
 
				+    
			
 
				+    latest_features = features.iloc[-1:]
			
 
				+    current_pred = clf.predict(latest_features)[0]
			
 
				+    pred_proba = clf.predict_proba(latest_features)[0]
			
 
				+    
			
 
				+    print(f"\n当前日期: {df.index[-1].date()}")
			
 
				+    print(f"当前价格: {df['close'].iloc[-1]:.2f}")
			
 
				+    print(f"\n预测状态: {state_names[current_pred]}")
			
 
				+    print(f"置信度: {pred_proba[current_pred]:.2%}")
			
 
				+    
			
 
				+    print("\n状态概率分布:")
			
 
				+    for i, name in enumerate(state_names):
			
 
				+        bar = '█' * int(pred_proba[i] * 20)
			
 
				+        print(f"  {name}: {pred_proba[i]:.2%} {bar}")
			
 
				+    
			
 
				+    # 保存模型
			
 
				+    print("\n保存模型...")
			
 
				+    import pickle
			
 
				+    with open('/root/.openclaw/workspace/market-regime-identifier/rf_classifier.pkl', 'wb') as f:
			
 
				+        pickle.dump(clf, f)
			
 
				+    print("✓ 模型已保存: rf_classifier.pkl")
			
 
				+    
			
 
				+    print("\n" + "="*70)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()