"""
Valan — sample loader  (fin_awards schema)
==========================================
Public evaluation sample · Feed snapshot 2026-05-30

Loads the Valan procurement sample and exposes its fields with the point-in-time
(PIT) and compliance rules applied HONESTLY. Column reference:
Valan_Data_Dictionary_TwoSigma.md  ->  the `fin_awards` table.

This sample carries the full fin_awards identity + financial block:
  financial_quality_score, is_framework, ticker_confidence, ticker_exchange,
  ticker_mic, ultimate_parent_lei, linked_tender_id, linked_prime_id.

SCOPE — investable-only slice
  Every row here has investable_flag=True. This is a curated *investable* showcase.
  In the full universe only ~17% (12.3M of 71.9M awards) are investable. Do NOT
  extrapolate coverage from this file.

POINT-IN-TIME / forward-bias — read before backtesting
  * ticker_as_of WITH pit_confirmed=True  ->  the ticker AS OF award_date. BACKTEST-SAFE.
  * ultimate_parent_ticker (parent rollup) ->  the CURRENT ownership link (who owns the
       supplier today), NOT as-of the award. Identity/screening info; NOT forward-bias-free.
  In this US-heavy slice ~772 / 1000 rows are pit_confirmed (backtest-grade); the remainder
  are investable only via current own-ticker or the parent rollup. `investable(use_pit=True)`
  returns the PIT set only.
  Caveats on rollup rows: some parents resolve to foreign/secondary lines
  (e.g. Fresenius -> 0OO9.LSE rather than FRE.DE).

Requires: pip install pandas pyarrow      (duckdb optional, used if present)
Usage:    python valan_sample_loader.py [path_to_parquet_or_csv]
"""
import os, sys, warnings
import pandas as pd
# explicit casts only (no silent-downcast surprises); guarded for pandas version drift
try:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        pd.set_option("future.no_silent_downcasting", True)
except (KeyError, Exception):
    pass

DEFAULT = "Valan_TwoSigma_sample_1k_fin_awards.parquet"


def _as_bool(df: pd.DataFrame, col: str) -> pd.Series:
    """Coerce a column (native bool, or 'True'/'False' strings from CSV) to clean bool."""
    if col not in df.columns:
        return pd.Series(False, index=df.index)
    s = df[col]
    if s.dtype == object:
        return s.map(lambda x: str(x).strip().lower() in ("true", "1", "t", "yes")).astype(bool)
    return s.fillna(False).astype(bool)


def load(path: str = DEFAULT) -> pd.DataFrame:
    """Load the sample and add derived columns, keeping PIT and current strictly separate."""
    if not os.path.isabs(path):
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), path)
    df = pd.read_parquet(path) if path.endswith(".parquet") else pd.read_csv(path)

    # fin_awards: award_value is DOUBLE, award_date is DATE — coerce defensively, keep originals.
    df["award_value_num"] = pd.to_numeric(df["award_value"], errors="coerce")
    df["award_date"] = pd.to_datetime(df["award_date"], errors="coerce")
    pit_ok = _as_bool(df, "pit_confirmed")

    # PIT-SAFE ticker — own listing, ONLY where pit_confirmed. Nothing else is forward-bias-free.
    df["ticker_pit"] = df["ticker_as_of"].where(pit_ok & df["ticker_as_of"].notna())
    # CURRENT, NON-PIT exposure (screening only — has look-ahead):
    df["parent_rollup_ticker_current"] = df.get("ultimate_parent_ticker")  # current ownership link
    df["ticker_current_or_rollup"] = df["ticker_current"].where(
        df["ticker_current"].notna(), df.get("ultimate_parent_ticker"))
    # primes carry tier_level 0 per the dictionary; coerce non-subcontract/missing -> 0
    if "tier_level" in df.columns:
        df["tier_level"] = pd.to_numeric(df["tier_level"], errors="coerce").where(
            _as_bool(df, "is_subcontract"), 0).fillna(0).astype(int)
    return df


def obligated_awards(df: pd.DataFrame) -> pd.DataFrame:
    """Real obligated awards: value>0, exclude framework/IDIQ ceilings and de-obligations."""
    return df[(df.get("value_type", "award") == "award")
              & (~_as_bool(df, "value_is_ceiling"))
              & (df["award_value_num"] > 0)]


def investable(df: pd.DataFrame, use_pit: bool = True) -> pd.DataFrame:
    """Tradeable rows.
    use_pit=True  -> ONLY pit_confirmed rows (forward-bias-free; ~772 here). USE FOR BACKTESTS.
    use_pit=False -> current tradeable incl. parent rollup (~1000). SCREENING ONLY — has look-ahead.
    """
    col = "ticker_pit" if use_pit else "ticker_current_or_rollup"
    return df[df[col].notna()]


if __name__ == "__main__":
    path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT
    df = load(path)
    print(f"loaded {len(df):,} rows × {df.shape[1]} cols  from  {os.path.basename(path)}")
    print( "  SCOPE: investable-only slice (full universe is ~17% investable) — do not extrapolate coverage")
    print(f"  date range:               {df['award_date'].min().date()} .. {df['award_date'].max().date()}")
    print(f"  currencies (local, never sum across): {sorted(df['currency'].dropna().unique())}")
    print(f"  PIT-SAFE rows (pit_confirmed):          {investable(df, use_pit=True).shape[0]:,}   <- backtest-safe")
    print(f"  current tradeable (incl parent rollup): {investable(df, use_pit=False).shape[0]:,}   <- screening only (look-ahead)")
    print(f"  obligated awards (ceilings excluded):   {obligated_awards(df).shape[0]:,}")
    print(f"  distinct PIT tickers:     {df['ticker_pit'].nunique()}   |  distinct current tickers: {df['ticker_current_or_rollup'].nunique()}")
    print(f"  buyer countries:          {df['buyer_country'].value_counts().head(6).to_dict()}")
    print("\n  top CURRENT tradeable tickers by obligated-award count (screening view, NOT a backtest):")
    g = (obligated_awards(investable(df, use_pit=False))
         .groupby('ticker_current_or_rollup')
         .agg(awards=('award_value_num', 'count'),
              currencies=('currency', lambda s: ','.join(sorted(s.dropna().unique()))),
              pit_rows=('ticker_pit', lambda s: int(s.notna().sum())))
         .sort_values('awards', ascending=False).head(8))
    for tk, r in g.iterrows():
        print(f"    {tk:<10} {int(r['awards']):>4} awards   pit-safe:{int(r['pit_rows']):>3}   [{r['currencies']}]")
    print("\n  ^ 'pit-safe' = how many of those rows are forward-bias-free. Backtest only those.")

    try:
        import duckdb
        con = duckdb.connect(); con.register("awards", df)
        print("\n  duckdb view 'awards' registered. PIT-safe backtest set:")
        print("    SELECT * FROM awards WHERE ticker_pit IS NOT NULL")
    except ImportError:
        pass
