|
|
@@ -0,0 +1,925 @@
|
|
|
+from __future__ import annotations
|
|
|
+
|
|
|
+import importlib
|
|
|
+import json
|
|
|
+import time
|
|
|
+from pathlib import Path
|
|
|
+from typing import Any, Callable, Mapping
|
|
|
+from urllib.error import HTTPError, URLError
|
|
|
+from urllib.parse import urlencode
|
|
|
+from urllib.request import Request, urlopen
|
|
|
+
|
|
|
+import numpy as np
|
|
|
+import pandas as pd
|
|
|
+
|
|
|
+
|
|
|
+BREADTH_REQUIRED_COLUMNS: tuple[str, ...] = (
|
|
|
+ 'pct_constituents_above_20dma',
|
|
|
+ 'pct_constituents_above_60dma',
|
|
|
+ 'pct_new_high_20',
|
|
|
+ 'pct_new_low_20',
|
|
|
+ 'eq_weight_ret_5',
|
|
|
+ 'weighted_ret_5',
|
|
|
+ 'top3_contribution_5',
|
|
|
+ 'top1_contribution_5',
|
|
|
+ 'top10_contribution_5',
|
|
|
+ 'sector_concentration_20',
|
|
|
+ 'corr_spike_20',
|
|
|
+ 'dispersion_20',
|
|
|
+)
|
|
|
+
|
|
|
+MAIRUI_BASE_URL = 'https://api.mairuiapi.com'
|
|
|
+DEFAULT_WARMUP_OBSERVATIONS: dict[str, int] = {
|
|
|
+ 'pct_constituents_above_20dma': 40,
|
|
|
+ 'pct_constituents_above_60dma': 60,
|
|
|
+ 'pct_new_high_20': 40,
|
|
|
+ 'pct_new_low_20': 40,
|
|
|
+ 'sector_concentration_20': 40,
|
|
|
+ 'corr_spike_20': 20,
|
|
|
+ 'dispersion_20': 20,
|
|
|
+ 'concentration_spread_5': 20,
|
|
|
+}
|
|
|
+CACHE_BOUNDARY_TOLERANCE_DAYS = 10
|
|
|
+META_FETCH_MAX_ATTEMPTS = 3
|
|
|
+
|
|
|
+
|
|
|
+def _to_yyyymmdd(value: str | None) -> str | None:
|
|
|
+ if value is None:
|
|
|
+ return None
|
|
|
+ return pd.Timestamp(value).strftime('%Y%m%d')
|
|
|
+
|
|
|
+
|
|
|
+def _load_akshare() -> Any:
|
|
|
+ try:
|
|
|
+ return importlib.import_module('akshare')
|
|
|
+ except ImportError as exc:
|
|
|
+ raise RuntimeError('derived breadth requires dependency "akshare". Install it first.') from exc
|
|
|
+
|
|
|
+
|
|
|
+def _load_json_url(url: str) -> Any:
|
|
|
+ request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
|
|
|
+ last_error: Exception | None = None
|
|
|
+ for attempt in range(5):
|
|
|
+ try:
|
|
|
+ with urlopen(request, timeout=30) as resp:
|
|
|
+ payload = json.loads(resp.read().decode('utf-8'))
|
|
|
+ if isinstance(payload, dict) and 'error' in payload:
|
|
|
+ raise ValueError(f'Mairui API error: {payload["error"]}')
|
|
|
+ return payload
|
|
|
+ except HTTPError as exc:
|
|
|
+ last_error = exc
|
|
|
+ if exc.code not in {429, 500, 502, 503, 504} or attempt == 4:
|
|
|
+ raise
|
|
|
+ time.sleep(1.0 + attempt * 1.5)
|
|
|
+ except URLError as exc:
|
|
|
+ last_error = exc
|
|
|
+ if attempt == 4:
|
|
|
+ raise
|
|
|
+ time.sleep(1.0 + attempt * 1.5)
|
|
|
+ if last_error is not None:
|
|
|
+ raise last_error
|
|
|
+ raise RuntimeError(f'Failed to fetch url: {url}')
|
|
|
+
|
|
|
+
|
|
|
+def _normalize_close_panel(df: pd.DataFrame, source_label: str) -> pd.DataFrame:
|
|
|
+ rename_map = {
|
|
|
+ 'date': 'date',
|
|
|
+ 'close': 'close',
|
|
|
+ 'c': 'close',
|
|
|
+ 't': 'date',
|
|
|
+ }
|
|
|
+ out = df.rename(columns=rename_map).copy()
|
|
|
+ out.columns = [str(col).strip().lower() for col in out.columns]
|
|
|
+ if 'date' not in out.columns or 'close' not in out.columns:
|
|
|
+ raise ValueError(f'{source_label} must contain date and close columns.')
|
|
|
+ out['date'] = pd.to_datetime(out['date'], errors='coerce')
|
|
|
+ out['close'] = pd.to_numeric(out['close'], errors='coerce')
|
|
|
+ out = out.dropna(subset=['date'])
|
|
|
+ out = out[['date', 'close']].drop_duplicates(subset=['date'], keep='last').sort_values('date')
|
|
|
+ return out.set_index('date')
|
|
|
+
|
|
|
+
|
|
|
+def _fetch_constituents_akshare(index_symbol: str) -> pd.DataFrame:
|
|
|
+ ak = _load_akshare()
|
|
|
+ raw = ak.index_stock_cons(symbol=index_symbol)
|
|
|
+ if raw is None or raw.empty:
|
|
|
+ raise ValueError(f'Akshare returned empty constituents for index {index_symbol}.')
|
|
|
+ out = raw.copy()
|
|
|
+ out.columns = [str(col).strip().lower() for col in out.columns]
|
|
|
+
|
|
|
+ code_col = next(
|
|
|
+ (
|
|
|
+ col
|
|
|
+ for col in out.columns
|
|
|
+ if out[col].astype(str).str.extract(r'(\d{6})', expand=False).notna().mean() > 0.5
|
|
|
+ ),
|
|
|
+ out.columns[0],
|
|
|
+ )
|
|
|
+ date_col = next(
|
|
|
+ (
|
|
|
+ col
|
|
|
+ for col in out.columns
|
|
|
+ if out[col]
|
|
|
+ .astype(str)
|
|
|
+ .str.strip()
|
|
|
+ .str.match(r'^(\\d{4}[-/]\\d{1,2}[-/]\\d{1,2}|\\d{8})$')
|
|
|
+ .mean()
|
|
|
+ > 0.5
|
|
|
+ ),
|
|
|
+ out.columns[min(2, len(out.columns) - 1)],
|
|
|
+ )
|
|
|
+ name_col = next((col for col in out.columns if col not in {code_col, date_col}), out.columns[min(1, len(out.columns) - 1)])
|
|
|
+
|
|
|
+ entry_raw = out[date_col].astype(str).str.strip()
|
|
|
+ entry_mask = entry_raw.str.match(r'^(\d{4}[-/]\d{1,2}[-/]\d{1,2}|\d{8})$', na=False)
|
|
|
+ rows = pd.DataFrame(
|
|
|
+ {
|
|
|
+ 'symbol': out[code_col].astype(str).str.extract(r'(\d{6})', expand=False),
|
|
|
+ 'name': out[name_col].astype(str),
|
|
|
+ 'entry_date': pd.to_datetime(entry_raw.where(entry_mask), errors='coerce'),
|
|
|
+ }
|
|
|
+ )
|
|
|
+ rows = rows.dropna(subset=['symbol']).drop_duplicates(subset=['symbol'], keep='last').sort_values('symbol')
|
|
|
+ if rows.empty:
|
|
|
+ raise ValueError(f'No valid constituent symbols parsed for index {index_symbol}.')
|
|
|
+ return rows.reset_index(drop=True)
|
|
|
+
|
|
|
+
|
|
|
+def _fetch_stock_history_akshare(symbol: str, start_date: str | None, end_date: str | None) -> pd.DataFrame:
|
|
|
+ ak = _load_akshare()
|
|
|
+ st = _to_yyyymmdd(start_date) or '20050101'
|
|
|
+ et = _to_yyyymmdd(end_date) or pd.Timestamp.today().strftime('%Y%m%d')
|
|
|
+ try:
|
|
|
+ raw = ak.stock_zh_a_hist(symbol=symbol, period='daily', start_date=st, end_date=et, adjust='')
|
|
|
+ except Exception as primary_exc:
|
|
|
+ # Eastmoney endpoint is occasionally unstable; fall back to Tencent line.
|
|
|
+ return _fetch_stock_history_akshare_tx(symbol=symbol, start_date=start_date, end_date=end_date, primary_exc=primary_exc)
|
|
|
+ if raw is None or raw.empty:
|
|
|
+ return _fetch_stock_history_akshare_tx(symbol=symbol, start_date=start_date, end_date=end_date, primary_exc=ValueError('empty panel'))
|
|
|
+
|
|
|
+ date_col = raw.columns[0]
|
|
|
+ close_col = None
|
|
|
+ lower_cols = [str(col).strip().lower() for col in raw.columns]
|
|
|
+ if 'close' in lower_cols:
|
|
|
+ close_col = raw.columns[lower_cols.index('close')]
|
|
|
+ elif len(raw.columns) >= 3:
|
|
|
+ close_col = raw.columns[2]
|
|
|
+ else:
|
|
|
+ numeric_cols = [
|
|
|
+ col for col in raw.columns[1:] if pd.to_numeric(raw[col], errors='coerce').notna().mean() > 0.5
|
|
|
+ ]
|
|
|
+ if numeric_cols:
|
|
|
+ close_col = numeric_cols[0]
|
|
|
+
|
|
|
+ if close_col is None:
|
|
|
+ raise ValueError(f'Unable to detect close column from Akshare history for symbol {symbol}.')
|
|
|
+
|
|
|
+ normalized = raw.rename(columns={date_col: 'date', close_col: 'close'})[['date', 'close']]
|
|
|
+ return _normalize_close_panel(normalized, source_label=f'akshare_stock_{symbol}')
|
|
|
+
|
|
|
+
|
|
|
+def _fetch_stock_history_akshare_tx(
|
|
|
+ *,
|
|
|
+ symbol: str,
|
|
|
+ start_date: str | None,
|
|
|
+ end_date: str | None,
|
|
|
+ primary_exc: Exception,
|
|
|
+) -> pd.DataFrame:
|
|
|
+ ak = _load_akshare()
|
|
|
+ tx_symbol = f"{'sh' if symbol.startswith(('6', '9')) else 'sz'}{symbol}"
|
|
|
+ st = _to_yyyymmdd(start_date) or '20050101'
|
|
|
+ et = _to_yyyymmdd(end_date) or pd.Timestamp.today().strftime('%Y%m%d')
|
|
|
+ try:
|
|
|
+ raw = ak.stock_zh_a_hist_tx(symbol=tx_symbol, start_date=st, end_date=et, adjust='')
|
|
|
+ except Exception as tx_exc:
|
|
|
+ raise ValueError(f'Akshare history failed for {symbol}: primary={primary_exc}; tx={tx_exc}') from tx_exc
|
|
|
+ if raw is None or raw.empty:
|
|
|
+ raise ValueError(f'Akshare history failed for {symbol}: primary={primary_exc}; tx=empty panel')
|
|
|
+ normalized = raw.rename(columns={'date': 'date', 'close': 'close'})[['date', 'close']]
|
|
|
+ return _normalize_close_panel(normalized, source_label=f'akshare_tx_stock_{symbol}')
|
|
|
+
|
|
|
+
|
|
|
+def _infer_mairui_exchange(symbol: str) -> str:
|
|
|
+ return 'SH' if symbol.startswith(('6', '9')) else 'SZ'
|
|
|
+
|
|
|
+
|
|
|
+def _fetch_stock_history_mairui(symbol: str, start_date: str | None, end_date: str | None, licence: str) -> pd.DataFrame:
|
|
|
+ if not licence:
|
|
|
+ raise ValueError('Mairui licence is required for fallback stock history fetch.')
|
|
|
+ code = f'{symbol}.{_infer_mairui_exchange(symbol)}'
|
|
|
+ path = f'/hsstock/history/{code}/d/n/{licence}'
|
|
|
+ params: dict[str, str] = {}
|
|
|
+ st = _to_yyyymmdd(start_date)
|
|
|
+ et = _to_yyyymmdd(end_date)
|
|
|
+ if st:
|
|
|
+ params['st'] = st
|
|
|
+ if et:
|
|
|
+ params['et'] = et
|
|
|
+ if params:
|
|
|
+ path = f'{path}?{urlencode(params)}'
|
|
|
+ payload = _load_json_url(f'{MAIRUI_BASE_URL}{path}')
|
|
|
+ if not isinstance(payload, list) or not payload:
|
|
|
+ raise ValueError(f'Mairui returned empty history for symbol {symbol}.')
|
|
|
+ return _normalize_close_panel(pd.DataFrame(payload), source_label=f'mairui_stock_{symbol}')
|
|
|
+
|
|
|
+
|
|
|
+def _to_float_or_none(value: Any) -> float | None:
|
|
|
+ if value is None:
|
|
|
+ return None
|
|
|
+ text = str(value).strip().replace(',', '')
|
|
|
+ if not text:
|
|
|
+ return None
|
|
|
+ try:
|
|
|
+ return float(text)
|
|
|
+ except ValueError:
|
|
|
+ return None
|
|
|
+
|
|
|
+
|
|
|
+def _load_cached_close_history(path: Path) -> pd.DataFrame | None:
|
|
|
+ if not path.exists():
|
|
|
+ return None
|
|
|
+ try:
|
|
|
+ cached = pd.read_csv(path)
|
|
|
+ except Exception:
|
|
|
+ return None
|
|
|
+ if cached.empty:
|
|
|
+ return None
|
|
|
+ try:
|
|
|
+ return _normalize_close_panel(cached, source_label=f'cache_{path.name}')
|
|
|
+ except Exception:
|
|
|
+ return None
|
|
|
+
|
|
|
+
|
|
|
+def _load_cached_meta(path: Path) -> dict[str, dict[str, Any]]:
|
|
|
+ if not path.exists():
|
|
|
+ return {}
|
|
|
+ try:
|
|
|
+ payload = json.loads(path.read_text(encoding='utf-8'))
|
|
|
+ except Exception:
|
|
|
+ return {}
|
|
|
+ if not isinstance(payload, dict):
|
|
|
+ return {}
|
|
|
+ out: dict[str, dict[str, Any]] = {}
|
|
|
+ for symbol, item in payload.items():
|
|
|
+ if not isinstance(item, dict):
|
|
|
+ continue
|
|
|
+ out[str(symbol).zfill(6)] = dict(item)
|
|
|
+ return out
|
|
|
+
|
|
|
+
|
|
|
+def _persist_cached_meta(path: Path, cache: Mapping[str, Mapping[str, Any]]) -> None:
|
|
|
+ serializable: dict[str, dict[str, Any]] = {}
|
|
|
+ for symbol, item in cache.items():
|
|
|
+ serializable[str(symbol).zfill(6)] = {
|
|
|
+ 'industry': str(item.get('industry') or 'unknown'),
|
|
|
+ 'float_shares': _to_float_or_none(item.get('float_shares')),
|
|
|
+ 'provider': str(item.get('provider') or 'unknown'),
|
|
|
+ 'error': str(item.get('error') or ''),
|
|
|
+ }
|
|
|
+ path.write_text(json.dumps(serializable, ensure_ascii=False, indent=2), encoding='utf-8')
|
|
|
+
|
|
|
+
|
|
|
+def _history_covers_range(history: pd.DataFrame, start_date: str | None, end_date: str | None) -> bool:
|
|
|
+ if history is None or history.empty:
|
|
|
+ return False
|
|
|
+ start = pd.Timestamp(start_date) if start_date else None
|
|
|
+ end = pd.Timestamp(end_date) if end_date else None
|
|
|
+ min_dt = history.index.min()
|
|
|
+ max_dt = history.index.max()
|
|
|
+ if start is not None and min_dt > start:
|
|
|
+ gap_days = int((min_dt - start).days)
|
|
|
+ if gap_days > CACHE_BOUNDARY_TOLERANCE_DAYS:
|
|
|
+ return False
|
|
|
+ if end is not None and max_dt < end:
|
|
|
+ gap_days = int((end - max_dt).days)
|
|
|
+ if gap_days > CACHE_BOUNDARY_TOLERANCE_DAYS:
|
|
|
+ return False
|
|
|
+ if start is not None and max_dt < start:
|
|
|
+ return False
|
|
|
+ if end is not None and min_dt > end:
|
|
|
+ return False
|
|
|
+ return True
|
|
|
+
|
|
|
+
|
|
|
+def _extract_key_value_map(raw: pd.DataFrame) -> dict[str, Any]:
|
|
|
+ if raw is None or raw.empty:
|
|
|
+ return {}
|
|
|
+ out = raw.copy()
|
|
|
+ cols = [str(col).strip() for col in out.columns]
|
|
|
+ lower_cols = [col.lower() for col in cols]
|
|
|
+
|
|
|
+ item_col = None
|
|
|
+ value_col = None
|
|
|
+ for candidate in ('item', '\u9879\u76ee', 'name', '\u540d\u79f0'):
|
|
|
+ if candidate in lower_cols:
|
|
|
+ item_col = out.columns[lower_cols.index(candidate)]
|
|
|
+ break
|
|
|
+ for candidate in ('value', '\u503c', '\u6570\u503c'):
|
|
|
+ if candidate in lower_cols:
|
|
|
+ value_col = out.columns[lower_cols.index(candidate)]
|
|
|
+ break
|
|
|
+
|
|
|
+ if item_col is None and len(out.columns) >= 1:
|
|
|
+ item_col = out.columns[0]
|
|
|
+ if value_col is None and len(out.columns) >= 2:
|
|
|
+ value_col = out.columns[1]
|
|
|
+ if item_col is None or value_col is None:
|
|
|
+ return {}
|
|
|
+
|
|
|
+ values: dict[str, Any] = {}
|
|
|
+ for key, value in zip(out[item_col], out[value_col], strict=False):
|
|
|
+ key_text = str(key).strip()
|
|
|
+ if not key_text:
|
|
|
+ continue
|
|
|
+ values[key_text] = value
|
|
|
+ values[key_text.lower()] = value
|
|
|
+ return values
|
|
|
+
|
|
|
+
|
|
|
+def _parse_stock_meta_values(values: Mapping[str, Any]) -> dict[str, Any]:
|
|
|
+ float_shares = None
|
|
|
+ for key, value in values.items():
|
|
|
+ key_text = str(key).strip().lower()
|
|
|
+ if 'float' in key_text or 'share' in key_text or 'circul' in key_text or '\u6d41\u901a' in key_text:
|
|
|
+ parsed = _to_float_or_none(value)
|
|
|
+ if parsed and parsed > 0:
|
|
|
+ float_shares = parsed
|
|
|
+ break
|
|
|
+ if float_shares is None:
|
|
|
+ numeric_candidates = [_to_float_or_none(v) for v in values.values()]
|
|
|
+ numeric_candidates = [v for v in numeric_candidates if v is not None and v > 0]
|
|
|
+ if numeric_candidates:
|
|
|
+ float_shares = max(numeric_candidates)
|
|
|
+
|
|
|
+ industry = 'unknown'
|
|
|
+ for key, value in values.items():
|
|
|
+ key_text = str(key).strip().lower()
|
|
|
+ if 'industry' in key_text or '\u884c\u4e1a' in key_text:
|
|
|
+ text = str(value).strip()
|
|
|
+ if text:
|
|
|
+ industry = text
|
|
|
+ break
|
|
|
+ return {'industry': industry, 'float_shares': float_shares}
|
|
|
+
|
|
|
+
|
|
|
+def _fetch_stock_meta_akshare(symbol: str) -> dict[str, Any]:
|
|
|
+ ak = _load_akshare()
|
|
|
+ last_error: Exception | None = None
|
|
|
+ for attempt in range(META_FETCH_MAX_ATTEMPTS):
|
|
|
+ try:
|
|
|
+ raw = ak.stock_individual_info_em(symbol=symbol)
|
|
|
+ values = _extract_key_value_map(raw)
|
|
|
+ if not values:
|
|
|
+ return {'industry': 'unknown', 'float_shares': None}
|
|
|
+ return _parse_stock_meta_values(values)
|
|
|
+ except Exception as exc:
|
|
|
+ last_error = exc
|
|
|
+ if attempt == META_FETCH_MAX_ATTEMPTS - 1:
|
|
|
+ raise
|
|
|
+ time.sleep(0.8 + attempt * 1.2)
|
|
|
+ if last_error is not None:
|
|
|
+ raise last_error
|
|
|
+ return {'industry': 'unknown', 'float_shares': None}
|
|
|
+
|
|
|
+
|
|
|
+def _pick_industry_from_mairui(payload: Any) -> str:
|
|
|
+ if not isinstance(payload, list):
|
|
|
+ return 'unknown'
|
|
|
+ entries: list[str] = []
|
|
|
+ for item in payload:
|
|
|
+ if not isinstance(item, dict):
|
|
|
+ continue
|
|
|
+ labels = [
|
|
|
+ str(item.get('jctype') or '').strip(),
|
|
|
+ str(item.get('type') or '').strip(),
|
|
|
+ ]
|
|
|
+ names = [
|
|
|
+ str(item.get('jcmc') or '').strip(),
|
|
|
+ str(item.get('name') or '').strip(),
|
|
|
+ ]
|
|
|
+ if any('\u884c\u4e1a' in label for label in labels):
|
|
|
+ for name in names:
|
|
|
+ if name:
|
|
|
+ return name
|
|
|
+ for name in names:
|
|
|
+ if name:
|
|
|
+ entries.append(name)
|
|
|
+ return entries[0] if entries else 'unknown'
|
|
|
+
|
|
|
+
|
|
|
+def _fetch_stock_meta_mairui(symbol: str, licence: str) -> dict[str, Any]:
|
|
|
+ if not licence:
|
|
|
+ raise ValueError('Mairui licence is required for stock metadata fallback fetch.')
|
|
|
+ url = f'{MAIRUI_BASE_URL}/hszg/zg/{symbol}/{licence}'
|
|
|
+ payload = _load_json_url(url)
|
|
|
+ industry = _pick_industry_from_mairui(payload)
|
|
|
+ return {'industry': industry, 'float_shares': None}
|
|
|
+
|
|
|
+
|
|
|
+def _resolve_symbol_meta(
|
|
|
+ *,
|
|
|
+ symbol: str,
|
|
|
+ mairui_licence: str | None,
|
|
|
+ fetch_meta: Callable[[str], Mapping[str, Any]],
|
|
|
+ fetch_meta_fallback: Callable[[str, str], Mapping[str, Any]],
|
|
|
+) -> tuple[dict[str, Any], str, str | None]:
|
|
|
+ errors: list[str] = []
|
|
|
+ provider = 'akshare'
|
|
|
+ meta: dict[str, Any] = {}
|
|
|
+ try:
|
|
|
+ meta = dict(fetch_meta(symbol))
|
|
|
+ except Exception as exc:
|
|
|
+ errors.append(f'akshare_meta={exc}')
|
|
|
+ meta = {}
|
|
|
+
|
|
|
+ industry_text = str(meta.get('industry') or '').strip()
|
|
|
+ if industry_text and industry_text.lower() != 'unknown':
|
|
|
+ return meta, provider, ('; '.join(errors) if errors else None)
|
|
|
+
|
|
|
+ if mairui_licence:
|
|
|
+ try:
|
|
|
+ fallback_meta = dict(fetch_meta_fallback(symbol, mairui_licence))
|
|
|
+ fallback_industry = str(fallback_meta.get('industry') or '').strip()
|
|
|
+ if fallback_industry and fallback_industry.lower() != 'unknown':
|
|
|
+ provider = 'mairui'
|
|
|
+ if not meta.get('float_shares') and fallback_meta.get('float_shares'):
|
|
|
+ meta['float_shares'] = fallback_meta.get('float_shares')
|
|
|
+ meta['industry'] = fallback_industry
|
|
|
+ return meta, provider, ('; '.join(errors) if errors else None)
|
|
|
+ except Exception as exc:
|
|
|
+ errors.append(f'mairui_meta={exc}')
|
|
|
+
|
|
|
+ if not industry_text:
|
|
|
+ meta['industry'] = 'unknown'
|
|
|
+ return meta, provider, ('; '.join(errors) if errors else None)
|
|
|
+
|
|
|
+
|
|
|
+def _mean_pairwise_corr(window: pd.DataFrame) -> float | None:
|
|
|
+ valid_cols = [col for col in window.columns if int(window[col].notna().sum()) >= 5]
|
|
|
+ if len(valid_cols) < 2:
|
|
|
+ return None
|
|
|
+ corr = window[valid_cols].corr()
|
|
|
+ if corr.empty:
|
|
|
+ return None
|
|
|
+ values = corr.to_numpy(dtype=float)
|
|
|
+ mask = np.triu(np.ones(values.shape, dtype=bool), k=1)
|
|
|
+ flat = values[mask]
|
|
|
+ flat = flat[~np.isnan(flat)]
|
|
|
+ if flat.size == 0:
|
|
|
+ return None
|
|
|
+ return float(flat.mean())
|
|
|
+
|
|
|
+
|
|
|
+def _compute_sector_concentration(weights: pd.DataFrame, industries: pd.Series) -> tuple[pd.Series, str]:
|
|
|
+ groups: dict[str, list[str]] = {}
|
|
|
+ for symbol, industry in industries.items():
|
|
|
+ key = str(industry).strip()
|
|
|
+ if not key:
|
|
|
+ key = 'unknown'
|
|
|
+ groups.setdefault(key, []).append(symbol)
|
|
|
+
|
|
|
+ known_groups = {k: v for k, v in groups.items() if k.lower() != 'unknown'}
|
|
|
+ if len(known_groups) < 2:
|
|
|
+ # Fallback: use concentration HHI when industry metadata is insufficient.
|
|
|
+ hhi = weights.fillna(0.0).pow(2).sum(axis=1, min_count=1)
|
|
|
+ hhi.index = weights.index
|
|
|
+ return hhi, 'weight_hhi_proxy'
|
|
|
+
|
|
|
+ out = pd.Series(index=weights.index, dtype=float)
|
|
|
+ for dt, row in weights.iterrows():
|
|
|
+ best = np.nan
|
|
|
+ for symbols in known_groups.values():
|
|
|
+ value = row[symbols].sum(min_count=1)
|
|
|
+ if pd.isna(value):
|
|
|
+ continue
|
|
|
+ best = float(value) if pd.isna(best) else float(max(best, value))
|
|
|
+ out.loc[dt] = best
|
|
|
+ return out, 'industry_max_share'
|
|
|
+
|
|
|
+
|
|
|
+def _build_required_breadth_columns(
|
|
|
+ close_panel: pd.DataFrame,
|
|
|
+ float_shares: pd.Series,
|
|
|
+ industries: pd.Series,
|
|
|
+) -> tuple[pd.DataFrame, dict[str, Any]]:
|
|
|
+ close = close_panel.sort_index()
|
|
|
+ active_mask = close.notna()
|
|
|
+ ret1 = close.pct_change()
|
|
|
+ ret5 = close.div(close.shift(5)).sub(1.0)
|
|
|
+
|
|
|
+ ma20 = close.rolling(20, min_periods=20).mean()
|
|
|
+ ma60 = close.rolling(60, min_periods=60).mean()
|
|
|
+ high20 = close.rolling(20, min_periods=20).max()
|
|
|
+ low20 = close.rolling(20, min_periods=20).min()
|
|
|
+
|
|
|
+ pct_above_20 = close.gt(ma20).where(active_mask).mean(axis=1, skipna=True)
|
|
|
+ pct_above_60 = close.gt(ma60).where(active_mask).mean(axis=1, skipna=True)
|
|
|
+ pct_new_high_20 = close.ge(high20).where(active_mask).mean(axis=1, skipna=True)
|
|
|
+ pct_new_low_20 = close.le(low20).where(active_mask).mean(axis=1, skipna=True)
|
|
|
+
|
|
|
+ eq_weight_ret_5 = ret5.mean(axis=1, skipna=True)
|
|
|
+ caps = close.mul(float_shares.reindex(close.columns).fillna(1.0), axis=1)
|
|
|
+ weights = caps.div(caps.sum(axis=1), axis=0)
|
|
|
+ weighted_ret_5 = weights.mul(ret5).sum(axis=1, min_count=1)
|
|
|
+
|
|
|
+ top1 = weights.apply(lambda row: float(row.dropna().nlargest(1).sum()) if row.notna().any() else np.nan, axis=1)
|
|
|
+ top3 = weights.apply(lambda row: float(row.dropna().nlargest(3).sum()) if row.notna().any() else np.nan, axis=1)
|
|
|
+ top10 = weights.apply(lambda row: float(row.dropna().nlargest(10).sum()) if row.notna().any() else np.nan, axis=1)
|
|
|
+
|
|
|
+ sector_raw, sector_mode = _compute_sector_concentration(weights, industries.reindex(close.columns).fillna('unknown'))
|
|
|
+ sector_concentration_20 = sector_raw.rolling(20, min_periods=5).mean()
|
|
|
+
|
|
|
+ corr_spike_values: list[float | None] = []
|
|
|
+ for i in range(len(ret1)):
|
|
|
+ start_i = max(0, i - 19)
|
|
|
+ corr_spike_values.append(_mean_pairwise_corr(ret1.iloc[start_i : i + 1]))
|
|
|
+ corr_spike_20 = pd.Series(corr_spike_values, index=ret1.index, dtype=float)
|
|
|
+
|
|
|
+ dispersion_20 = ret1.std(axis=1, skipna=True).rolling(20, min_periods=5).mean()
|
|
|
+
|
|
|
+ out = pd.DataFrame(
|
|
|
+ {
|
|
|
+ 'pct_constituents_above_20dma': pct_above_20,
|
|
|
+ 'pct_constituents_above_60dma': pct_above_60,
|
|
|
+ 'pct_new_high_20': pct_new_high_20,
|
|
|
+ 'pct_new_low_20': pct_new_low_20,
|
|
|
+ 'eq_weight_ret_5': eq_weight_ret_5,
|
|
|
+ 'weighted_ret_5': weighted_ret_5,
|
|
|
+ 'top3_contribution_5': top3,
|
|
|
+ 'top1_contribution_5': top1,
|
|
|
+ 'top10_contribution_5': top10,
|
|
|
+ 'sector_concentration_20': sector_concentration_20,
|
|
|
+ 'corr_spike_20': corr_spike_20,
|
|
|
+ 'dispersion_20': dispersion_20,
|
|
|
+ },
|
|
|
+ index=close.index,
|
|
|
+ )
|
|
|
+ out.index.name = 'date'
|
|
|
+ diagnostics = {
|
|
|
+ 'sector_concentration_mode': sector_mode,
|
|
|
+ }
|
|
|
+ return out[list(BREADTH_REQUIRED_COLUMNS)], diagnostics
|
|
|
+
|
|
|
+
|
|
|
+def derive_breadth_sidecar(
|
|
|
+ *,
|
|
|
+ start_date: str | None,
|
|
|
+ end_date: str | None,
|
|
|
+ index_symbol: str = '399673',
|
|
|
+ mairui_licence: str | None = None,
|
|
|
+ min_active_constituents: int = 20,
|
|
|
+ max_constituents: int | None = None,
|
|
|
+ cache_dir: str | Path | None = None,
|
|
|
+ constituent_fetcher: Callable[[str], pd.DataFrame] | None = None,
|
|
|
+ stock_history_fetcher: Callable[[str, str | None, str | None], pd.DataFrame] | None = None,
|
|
|
+ stock_history_fallback_fetcher: Callable[[str, str | None, str | None, str], pd.DataFrame] | None = None,
|
|
|
+ stock_meta_fetcher: Callable[[str], Mapping[str, Any]] | None = None,
|
|
|
+ stock_meta_fallback_fetcher: Callable[[str, str], Mapping[str, Any]] | None = None,
|
|
|
+) -> tuple[pd.DataFrame, dict[str, Any]]:
|
|
|
+ fetch_constituents = constituent_fetcher or _fetch_constituents_akshare
|
|
|
+ fetch_history = stock_history_fetcher or _fetch_stock_history_akshare
|
|
|
+ fetch_history_fallback = stock_history_fallback_fetcher or _fetch_stock_history_mairui
|
|
|
+ fetch_meta = stock_meta_fetcher or _fetch_stock_meta_akshare
|
|
|
+ fetch_meta_fallback = stock_meta_fallback_fetcher or _fetch_stock_meta_mairui
|
|
|
+
|
|
|
+ constituents = fetch_constituents(index_symbol)
|
|
|
+ symbols = [str(sym).zfill(6) for sym in constituents['symbol'].tolist()]
|
|
|
+ if max_constituents is not None and int(max_constituents) > 0:
|
|
|
+ symbols = symbols[: int(max_constituents)]
|
|
|
+ if not symbols:
|
|
|
+ raise ValueError(f'No constituents available for index {index_symbol}.')
|
|
|
+
|
|
|
+ close_series_map: dict[str, pd.Series] = {}
|
|
|
+ industries: dict[str, str] = {}
|
|
|
+ float_shares: dict[str, float] = {}
|
|
|
+ provider_by_symbol: dict[str, str] = {}
|
|
|
+ missing_symbols: list[str] = []
|
|
|
+ errors: dict[str, str] = {}
|
|
|
+ metadata_provider_by_symbol: dict[str, str] = {}
|
|
|
+ metadata_errors: dict[str, str] = {}
|
|
|
+ cache_hit_count = 0
|
|
|
+ cache_miss_count = 0
|
|
|
+ cache_root: Path | None = Path(cache_dir) if cache_dir else None
|
|
|
+ meta_cache_hit_count = 0
|
|
|
+ meta_cache_miss_count = 0
|
|
|
+ meta_cache_root: dict[str, dict[str, Any]] = {}
|
|
|
+ meta_cache_path: Path | None = None
|
|
|
+ if cache_root is not None:
|
|
|
+ cache_root.mkdir(parents=True, exist_ok=True)
|
|
|
+ meta_cache_path = cache_root / '_meta_cache.json'
|
|
|
+ meta_cache_root = _load_cached_meta(meta_cache_path)
|
|
|
+
|
|
|
+ def _resolve_meta_with_cache(symbol: str) -> tuple[dict[str, Any], str, str | None]:
|
|
|
+ nonlocal meta_cache_hit_count, meta_cache_miss_count, meta_cache_root
|
|
|
+ cached_meta = meta_cache_root.get(symbol)
|
|
|
+ if cached_meta is not None:
|
|
|
+ meta_cache_hit_count += 1
|
|
|
+ return (
|
|
|
+ {
|
|
|
+ 'industry': str(cached_meta.get('industry') or 'unknown'),
|
|
|
+ 'float_shares': _to_float_or_none(cached_meta.get('float_shares')),
|
|
|
+ },
|
|
|
+ str(cached_meta.get('provider') or 'unknown'),
|
|
|
+ str(cached_meta.get('error') or '').strip() or None,
|
|
|
+ )
|
|
|
+
|
|
|
+ meta_cache_miss_count += 1
|
|
|
+ meta, provider, meta_error = _resolve_symbol_meta(
|
|
|
+ symbol=symbol,
|
|
|
+ mairui_licence=mairui_licence,
|
|
|
+ fetch_meta=fetch_meta,
|
|
|
+ fetch_meta_fallback=fetch_meta_fallback,
|
|
|
+ )
|
|
|
+ meta_cache_root[symbol] = {
|
|
|
+ 'industry': str(meta.get('industry') or 'unknown'),
|
|
|
+ 'float_shares': _to_float_or_none(meta.get('float_shares')),
|
|
|
+ 'provider': provider,
|
|
|
+ 'error': str(meta_error or ''),
|
|
|
+ }
|
|
|
+ return meta, provider, meta_error
|
|
|
+
|
|
|
+ for symbol in symbols:
|
|
|
+ cache_path = (cache_root / f'{symbol}.csv') if cache_root is not None else None
|
|
|
+ if cache_path is not None:
|
|
|
+ cached_panel = _load_cached_close_history(cache_path)
|
|
|
+ if cached_panel is not None and _history_covers_range(cached_panel, start_date, end_date):
|
|
|
+ close_series_map[symbol] = cached_panel['close']
|
|
|
+ provider_by_symbol[symbol] = 'cache'
|
|
|
+ cache_hit_count += 1
|
|
|
+ meta, metadata_provider, meta_error = _resolve_meta_with_cache(symbol)
|
|
|
+ metadata_provider_by_symbol[symbol] = metadata_provider
|
|
|
+ if meta_error:
|
|
|
+ metadata_errors[symbol] = meta_error
|
|
|
+ industries[symbol] = str(meta.get('industry') or 'unknown')
|
|
|
+ float_value = _to_float_or_none(meta.get('float_shares'))
|
|
|
+ float_shares[symbol] = float_value if float_value and float_value > 0 else 1.0
|
|
|
+ continue
|
|
|
+ cache_miss_count += 1
|
|
|
+
|
|
|
+ panel: pd.DataFrame | None = None
|
|
|
+ provider = 'akshare'
|
|
|
+ try:
|
|
|
+ panel = fetch_history(symbol, start_date, end_date)
|
|
|
+ if panel is None or panel.empty:
|
|
|
+ raise ValueError('empty panel')
|
|
|
+ except Exception as primary_exc:
|
|
|
+ if mairui_licence:
|
|
|
+ try:
|
|
|
+ panel = fetch_history_fallback(symbol, start_date, end_date, mairui_licence)
|
|
|
+ provider = 'mairui'
|
|
|
+ except Exception as fallback_exc:
|
|
|
+ errors[symbol] = f'akshare={primary_exc}; mairui={fallback_exc}'
|
|
|
+ else:
|
|
|
+ errors[symbol] = str(primary_exc)
|
|
|
+
|
|
|
+ if panel is None or panel.empty:
|
|
|
+ missing_symbols.append(symbol)
|
|
|
+ continue
|
|
|
+
|
|
|
+ normalized = _normalize_close_panel(panel.reset_index(), source_label=f'stock_{symbol}')
|
|
|
+ if normalized.empty:
|
|
|
+ missing_symbols.append(symbol)
|
|
|
+ errors[symbol] = 'empty normalized close series'
|
|
|
+ continue
|
|
|
+
|
|
|
+ close_series_map[symbol] = normalized['close']
|
|
|
+ provider_by_symbol[symbol] = provider
|
|
|
+ if cache_path is not None:
|
|
|
+ normalized.reset_index().to_csv(cache_path, index=False)
|
|
|
+ meta, metadata_provider, meta_error = _resolve_meta_with_cache(symbol)
|
|
|
+ metadata_provider_by_symbol[symbol] = metadata_provider
|
|
|
+ if meta_error:
|
|
|
+ metadata_errors[symbol] = meta_error
|
|
|
+ industries[symbol] = str(meta.get('industry') or 'unknown')
|
|
|
+ float_value = _to_float_or_none(meta.get('float_shares'))
|
|
|
+ float_shares[symbol] = float_value if float_value and float_value > 0 else 1.0
|
|
|
+
|
|
|
+ if not close_series_map:
|
|
|
+ raise ValueError(f'Unable to fetch any constituent history for index {index_symbol}.')
|
|
|
+
|
|
|
+ close_panel = pd.concat(close_series_map, axis=1).sort_index()
|
|
|
+ close_panel.index.name = 'date'
|
|
|
+ if start_date:
|
|
|
+ close_panel = close_panel.loc[close_panel.index >= pd.Timestamp(start_date)]
|
|
|
+ if end_date:
|
|
|
+ close_panel = close_panel.loc[close_panel.index <= pd.Timestamp(end_date)]
|
|
|
+ if close_panel.empty:
|
|
|
+ raise ValueError('Derived breadth close panel is empty after date filtering.')
|
|
|
+
|
|
|
+ active_count = close_panel.notna().sum(axis=1)
|
|
|
+ max_active = int(active_count.max()) if len(active_count) else 0
|
|
|
+ if max_active < int(min_active_constituents):
|
|
|
+ raise ValueError(
|
|
|
+ f'Derived breadth active constituent count too low: max_active={max_active}, '
|
|
|
+ f'min_required={int(min_active_constituents)}'
|
|
|
+ )
|
|
|
+
|
|
|
+ breadth, builder_diag = _build_required_breadth_columns(
|
|
|
+ close_panel=close_panel,
|
|
|
+ float_shares=pd.Series(float_shares, dtype=float),
|
|
|
+ industries=pd.Series(industries, dtype=str),
|
|
|
+ )
|
|
|
+
|
|
|
+ industry_series = pd.Series(industries, dtype=str)
|
|
|
+ industry_unknown_count = int(industry_series.str.lower().eq('unknown').sum())
|
|
|
+ industry_unique_count = int(industry_series.str.lower().replace('', 'unknown').nunique())
|
|
|
+
|
|
|
+ provider_counts = pd.Series(provider_by_symbol).value_counts().to_dict()
|
|
|
+ metadata_provider_counts = pd.Series(metadata_provider_by_symbol).value_counts().to_dict()
|
|
|
+ if meta_cache_path is not None:
|
|
|
+ _persist_cached_meta(meta_cache_path, meta_cache_root)
|
|
|
+ metadata = {
|
|
|
+ 'index_symbol': str(index_symbol),
|
|
|
+ 'membership_mode': 'latest_constituents_with_entry_dates',
|
|
|
+ 'constituent_count_total': int(len(constituents)),
|
|
|
+ 'constituent_count_requested': int(len(symbols)),
|
|
|
+ 'constituent_count_used': int(len(close_series_map)),
|
|
|
+ 'missing_symbols': sorted(missing_symbols),
|
|
|
+ 'provider_by_symbol': provider_by_symbol,
|
|
|
+ 'provider_counts': {str(k): int(v) for k, v in provider_counts.items()},
|
|
|
+ 'metadata_provider_by_symbol': metadata_provider_by_symbol,
|
|
|
+ 'metadata_provider_counts': {str(k): int(v) for k, v in metadata_provider_counts.items()},
|
|
|
+ 'metadata_errors': metadata_errors,
|
|
|
+ 'cache': {
|
|
|
+ 'enabled': bool(cache_root is not None),
|
|
|
+ 'path': str(cache_root) if cache_root is not None else None,
|
|
|
+ 'hit_count': int(cache_hit_count),
|
|
|
+ 'miss_count': int(cache_miss_count),
|
|
|
+ },
|
|
|
+ 'meta_cache': {
|
|
|
+ 'enabled': bool(meta_cache_path is not None),
|
|
|
+ 'path': str(meta_cache_path) if meta_cache_path is not None else None,
|
|
|
+ 'hit_count': int(meta_cache_hit_count),
|
|
|
+ 'miss_count': int(meta_cache_miss_count),
|
|
|
+ },
|
|
|
+ 'active_constituent_count': {
|
|
|
+ 'min': int(active_count.min()) if len(active_count) else 0,
|
|
|
+ 'median': float(active_count.median()) if len(active_count) else 0.0,
|
|
|
+ 'max': max_active,
|
|
|
+ },
|
|
|
+ 'date_start': breadth.index.min().date().isoformat() if len(breadth) else None,
|
|
|
+ 'date_end': breadth.index.max().date().isoformat() if len(breadth) else None,
|
|
|
+ 'row_count': int(len(breadth)),
|
|
|
+ 'industry_unknown_count': industry_unknown_count,
|
|
|
+ 'industry_unknown_ratio': float(industry_unknown_count / len(close_series_map)) if close_series_map else 1.0,
|
|
|
+ 'industry_unique_count': industry_unique_count,
|
|
|
+ 'sector_concentration_mode': builder_diag['sector_concentration_mode'],
|
|
|
+ 'errors': errors,
|
|
|
+ }
|
|
|
+ return breadth, metadata
|
|
|
+
|
|
|
+
|
|
|
+def evaluate_breadth_source_integrity(
|
|
|
+ breadth: pd.DataFrame,
|
|
|
+ *,
|
|
|
+ required_columns: tuple[str, ...] | list[str] = BREADTH_REQUIRED_COLUMNS,
|
|
|
+ min_unique_non_null: int = 3,
|
|
|
+ max_dominant_value_ratio: float = 0.995,
|
|
|
+ std_floor: float = 1e-8,
|
|
|
+ strict: bool = False,
|
|
|
+ warmup_observations: Mapping[str, int] | None = None,
|
|
|
+) -> dict[str, Any]:
|
|
|
+ panel = breadth.copy()
|
|
|
+ panel.columns = [str(col).strip().lower() for col in panel.columns]
|
|
|
+ req = [str(col).strip().lower() for col in required_columns]
|
|
|
+ failures: list[dict[str, Any]] = []
|
|
|
+ warnings: list[dict[str, Any]] = []
|
|
|
+ column_stats: dict[str, dict[str, Any]] = {}
|
|
|
+ warmup_rules = {str(k).strip().lower(): int(v) for k, v in (warmup_observations or DEFAULT_WARMUP_OBSERVATIONS).items()}
|
|
|
+ warmup_exempt_columns: list[dict[str, Any]] = []
|
|
|
+
|
|
|
+ for column in req:
|
|
|
+ if column not in panel.columns:
|
|
|
+ failure = {'column': column, 'reason': 'missing_required_column'}
|
|
|
+ failures.append(failure)
|
|
|
+ warnings.append(failure)
|
|
|
+ column_stats[column] = {
|
|
|
+ 'present': False,
|
|
|
+ 'non_null_count': 0,
|
|
|
+ 'unique_non_null_count': 0,
|
|
|
+ 'dominant_value_ratio': 1.0,
|
|
|
+ 'std': 0.0,
|
|
|
+ }
|
|
|
+ continue
|
|
|
+
|
|
|
+ values = pd.to_numeric(panel[column], errors='coerce').dropna()
|
|
|
+ non_null_count = int(values.shape[0])
|
|
|
+ unique_count = int(values.nunique())
|
|
|
+ std_value = float(values.std(ddof=0)) if non_null_count else 0.0
|
|
|
+ dominant_ratio = 1.0
|
|
|
+ if non_null_count:
|
|
|
+ dominant_ratio = float(values.value_counts(normalize=True, dropna=True).iloc[0])
|
|
|
+
|
|
|
+ column_stats[column] = {
|
|
|
+ 'present': True,
|
|
|
+ 'non_null_count': non_null_count,
|
|
|
+ 'unique_non_null_count': unique_count,
|
|
|
+ 'dominant_value_ratio': dominant_ratio,
|
|
|
+ 'std': std_value,
|
|
|
+ }
|
|
|
+
|
|
|
+ required_obs = int(warmup_rules.get(column, 0))
|
|
|
+ if required_obs > 0 and non_null_count < required_obs:
|
|
|
+ warmup_exempt_columns.append(
|
|
|
+ {
|
|
|
+ 'column': column,
|
|
|
+ 'required_non_null': required_obs,
|
|
|
+ 'observed_non_null': non_null_count,
|
|
|
+ }
|
|
|
+ )
|
|
|
+ continue
|
|
|
+
|
|
|
+ if unique_count < int(min_unique_non_null):
|
|
|
+ item = {
|
|
|
+ 'column': column,
|
|
|
+ 'reason': 'low_unique_non_null',
|
|
|
+ 'observed': unique_count,
|
|
|
+ 'threshold': int(min_unique_non_null),
|
|
|
+ }
|
|
|
+ failures.append(item)
|
|
|
+ warnings.append(item)
|
|
|
+ if dominant_ratio > float(max_dominant_value_ratio):
|
|
|
+ item = {
|
|
|
+ 'column': column,
|
|
|
+ 'reason': 'dominant_value_ratio_too_high',
|
|
|
+ 'observed': dominant_ratio,
|
|
|
+ 'threshold': float(max_dominant_value_ratio),
|
|
|
+ }
|
|
|
+ failures.append(item)
|
|
|
+ warnings.append(item)
|
|
|
+ if std_value <= float(std_floor) and (
|
|
|
+ unique_count < int(min_unique_non_null) or dominant_ratio > float(max_dominant_value_ratio)
|
|
|
+ ):
|
|
|
+ item = {
|
|
|
+ 'column': column,
|
|
|
+ 'reason': 'std_below_floor',
|
|
|
+ 'observed': std_value,
|
|
|
+ 'threshold': float(std_floor),
|
|
|
+ }
|
|
|
+ failures.append(item)
|
|
|
+ warnings.append(item)
|
|
|
+
|
|
|
+ spread_stats: dict[str, Any] = {
|
|
|
+ 'present': False,
|
|
|
+ 'non_null_count': 0,
|
|
|
+ 'unique_non_null_count': 0,
|
|
|
+ 'dominant_value_ratio': 1.0,
|
|
|
+ 'std': 0.0,
|
|
|
+ }
|
|
|
+ if {'eq_weight_ret_5', 'weighted_ret_5'}.issubset(panel.columns):
|
|
|
+ spread = pd.to_numeric(panel['weighted_ret_5'], errors='coerce') - pd.to_numeric(
|
|
|
+ panel['eq_weight_ret_5'], errors='coerce'
|
|
|
+ )
|
|
|
+ spread = spread.dropna()
|
|
|
+ non_null_count = int(spread.shape[0])
|
|
|
+ unique_count = int(spread.nunique()) if non_null_count else 0
|
|
|
+ std_value = float(spread.std(ddof=0)) if non_null_count else 0.0
|
|
|
+ dominant_ratio = 1.0
|
|
|
+ if non_null_count:
|
|
|
+ dominant_ratio = float(spread.value_counts(normalize=True, dropna=True).iloc[0])
|
|
|
+
|
|
|
+ spread_stats = {
|
|
|
+ 'present': True,
|
|
|
+ 'non_null_count': non_null_count,
|
|
|
+ 'unique_non_null_count': unique_count,
|
|
|
+ 'dominant_value_ratio': dominant_ratio,
|
|
|
+ 'std': std_value,
|
|
|
+ }
|
|
|
+ spread_required_obs = int(warmup_rules.get('concentration_spread_5', 0))
|
|
|
+ if spread_required_obs > 0 and non_null_count < spread_required_obs:
|
|
|
+ warmup_exempt_columns.append(
|
|
|
+ {
|
|
|
+ 'column': 'concentration_spread_5',
|
|
|
+ 'required_non_null': spread_required_obs,
|
|
|
+ 'observed_non_null': non_null_count,
|
|
|
+ }
|
|
|
+ )
|
|
|
+ elif unique_count < int(min_unique_non_null) or std_value <= float(std_floor):
|
|
|
+ item = {
|
|
|
+ 'column': 'concentration_spread_5',
|
|
|
+ 'reason': 'constant_or_near_constant_spread',
|
|
|
+ 'observed_unique_non_null': unique_count,
|
|
|
+ 'observed_std': std_value,
|
|
|
+ 'threshold_unique_non_null': int(min_unique_non_null),
|
|
|
+ 'threshold_std_floor': float(std_floor),
|
|
|
+ }
|
|
|
+ failures.append(item)
|
|
|
+ warnings.append(item)
|
|
|
+
|
|
|
+ blocking = bool(strict and failures)
|
|
|
+ return {
|
|
|
+ 'strict': bool(strict),
|
|
|
+ 'passed': not failures,
|
|
|
+ 'blocking': blocking,
|
|
|
+ 'thresholds': {
|
|
|
+ 'min_unique_non_null': int(min_unique_non_null),
|
|
|
+ 'max_dominant_value_ratio': float(max_dominant_value_ratio),
|
|
|
+ 'std_floor': float(std_floor),
|
|
|
+ 'warmup_observations': warmup_rules,
|
|
|
+ },
|
|
|
+ 'failures': failures,
|
|
|
+ 'warnings': warnings,
|
|
|
+ 'column_stats': column_stats,
|
|
|
+ 'spread_stats': spread_stats,
|
|
|
+ 'warmup_exempt_columns': warmup_exempt_columns,
|
|
|
+ }
|