#!/usr/bin/env python3
"""Scrape CAR's First-Time Buyer Housing Affordability Index table to CSV."""

from __future__ import annotations

import argparse
import csv
import sys
from collections.abc import Iterable
from html.parser import HTMLParser
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen


SOURCE_URL = "https://www.car.org/marketdata/data/ftbhai"
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)
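
# The quarter columns mirror the source table at the time of writing; C.A.R.
# shifts them with each quarterly release, so update these labels as needed.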
CSV_HEADERS = [
    "State/Region/County",
    "Q42025",
    "Q32025",
    "Q42024",
    "Median Home Price",
    "Monthly Payment Including Taxes & Insurance",
    "Minimum Qualifying Income",
]


class TableParser(HTMLParser):
    """Collect plain-text cell values from the first HTML table fragment."""

    def __init__(self) -> None:
        super().__init__()
        self.in_tr = False
        self.in_cell = False
        self.rows: list[list[str]] = []
        self.current_row: list[str] = []
        self.current_parts: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if tag == "tr":
            # Tolerate unclosed markup by flushing any cell or row still open.
            if self.in_cell:
                self._flush_cell()
            if self.in_tr and self.current_row:
                self.rows.append(self.current_row)
            self.in_tr = True
            self.current_row = []
        elif self.in_tr and tag in {"td", "th"}:
            self.in_cell = True
            self.current_parts = []

    def handle_endtag(self, tag: str) -> None:
        if self.in_tr and self.in_cell and tag in {"td", "th"}:
            self._flush_cell()
        elif self.in_tr and tag == "tr":
            # Tolerate a missing </td> by flushing any cell still open.
            if self.in_cell:
                self._flush_cell()
            if self.current_row:
                self.rows.append(self.current_row)
            self.in_tr = False

    def _flush_cell(self) -> None:
        # Collapse non-breaking spaces and runs of whitespace to single spaces.
        text = " ".join("".join(self.current_parts).replace("\xa0", " ").split())
        self.current_row.append(text)
        self.in_cell = False
        self.current_parts = []

    def handle_data(self, data: str) -> None:
        if self.in_cell:
            self.current_parts.append(data)
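
# Illustration of the parser's output on a minimal fragment (hypothetical
# markup, not C.A.R.'s actual page):
#   p = TableParser()
#   p.feed("<table><tr><td>San Diego</td><td>$905,000</td></tr></table>")
#   p.rows == [["San Diego", "$905,000"]]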


def fetch_html(url: str, timeout: float) -> str:
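    """Download the page body, decoded as UTF-8 (undecodable bytes dropped)."""
    # A browser-style User-Agent is sent because some hosts reject urllib's
    # default agent string outright.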
    request = Request(url, headers={"User-Agent": USER_AGENT})
    with urlopen(request, timeout=timeout) as response:
        return response.read().decode("utf-8", errors="ignore")


def extract_first_table(html: str) -> str:
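    # Assumes the affordability table is the first <table> on the page; a
    # layout change on car.org could require a narrower match.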
    table_start = html.find("<table")
    if table_start == -1:
        raise ValueError("No <table> tag found in the page HTML.")

    table_end = html.find("</table>", table_start)
    if table_end == -1:
        raise ValueError("Found a <table> tag but not a closing </table> tag.")

    return html[table_start : table_end + len("</table>")]


def normalize_rows(raw_rows: Iterable[list[str]]) -> list[list[str]]:
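    """Map raw rows with at least 9 cells onto the 7 output columns.

    Hypothetical example of the mapping:
        raw = ["Los Angeles", "34%", "36%", "", "33%", "", "$750,000", "$4,630", "$185,200"]
        row = ["Los Angeles", "34%", "36%", "33%", "$750,000", "$4,630", "$185,200"]
    """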
    normalized: list[list[str]] = []

    for raw in raw_rows:
        if len(raw) < 9:
            continue

        # Cells 3 and 5 look like blank spacer columns in the scraped markup,
        # so they are dropped.
        row = [raw[0], raw[1], raw[2], raw[4], raw[6], raw[7], raw[8]]
        if not any(cell.strip() for cell in row):
            continue

        normalized.append(row)

    return normalized


def scrape_ftbhai(url: str, timeout: float) -> list[list[str]]:
    html = fetch_html(url, timeout)
    table_html = extract_first_table(html)

    parser = TableParser()
    parser.feed(table_html)

    # The first three rows are the title row, header row, and a blank spacer row.
    return normalize_rows(parser.rows[3:])


def write_csv(path: Path, rows: list[list[str]]) -> None:
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(CSV_HEADERS)
        writer.writerows(rows)


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description=(
            "Scrape CAR's First-Time Buyer Housing Affordability Index table and "
            "write it to a CSV file."
        )
    )
    parser.add_argument(
        "-o",
        "--output",
        default="ftbhai.csv",
        help="Output CSV path. Defaults to ./ftbhai.csv",
    )
    parser.add_argument(
        "--url",
        default=SOURCE_URL,
        help=f"Source page URL. Defaults to {SOURCE_URL}",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=20.0,
        help="HTTP timeout in seconds. Defaults to 20.",
    )
    parser.add_argument(
        "--stdout",
        action="store_true",
        help="Also print the CSV to stdout after writing the file.",
    )
    return parser


def main() -> int:
    parser = build_parser()
    args = parser.parse_args()

    try:
        rows = scrape_ftbhai(args.url, args.timeout)
    except HTTPError as exc:
        print(f"HTTP error while fetching {args.url}: {exc.code} {exc.reason}", file=sys.stderr)
        return 1
    except URLError as exc:
        print(f"Network error while fetching {args.url}: {exc.reason}", file=sys.stderr)
        return 1
    except ValueError as exc:
        print(f"Parse error: {exc}", file=sys.stderr)
        return 1

    output_path = Path(args.output).expanduser().resolve()
    try:
        write_csv(output_path, rows)
    except OSError as exc:
        print(f"Could not write {output_path}: {exc}", file=sys.stderr)
        return 1

    print(f"Wrote {len(rows)} rows to {output_path}")

    if args.stdout:
        writer = csv.writer(sys.stdout)
        writer.writerow(CSV_HEADERS)
        writer.writerows(rows)

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
