"""Parse Pertamina transaction detail PDFs (English and Indonesian formats)."""

from __future__ import annotations

import io
import re
from dataclasses import dataclass, field
from datetime import date, datetime
from decimal import Decimal, InvalidOperation


@dataclass
class ParsedItem:
    """One line item taken from the item table of a Pertamina PDF."""

    # Item code captured from the "A<digits> - <name>" cell prefix; the parser
    # stores "" when the cell does not follow that pattern.
    kode_item_pertamina: str
    # Product name from the PDF after the parser strips ",BULK" and collapses
    # whitespace (so not strictly the raw cell text).
    product_name: str
    # Quantity as stored by the parser after its KL -> litre heuristics.
    volume_liter: Decimal
    # Sales-order number cell, or None when the column is absent or empty.
    no_so: str | None = None
    # Delivery date parsed from the "Tanggal Kirim" / "Sent Date" column.
    tanggal_kirim: date | None = None
    # Whitespace-normalised "Dikirim Ke" / "Sent To" cell text.
    sent_to_text: str | None = None


@dataclass
class ParsedPDF:
    """Structured result of parsing one Pertamina transaction detail PDF.

    The monetary fields default to 0 and are only overwritten when the
    matching "<label> <amount> IDR" line is found in the PDF text.
    """

    # Value following "Kode Booking" / "Booking Code".
    booking_code: str
    # Transaction date ("Tanggal" / "Date" label, or first parseable date).
    tanggal: date
    # Line items collected from every recognised item table.
    items: list[ParsedItem] = field(default_factory=list)
    # "Sub total" / "Subtotal" amount.
    subtotal: Decimal = Decimal("0")
    # "Discount" / "Diskon" amount.
    discount: Decimal = Decimal("0")
    # "VAT" / "PPN" amount.
    ppn: Decimal = Decimal("0")
    # "Income Tax" / "PPh 22" amount.
    pph22: Decimal = Decimal("0")
    # "MVFT" / "PBBKB" amount.
    pbbkb: Decimal = Decimal("0")
    # "Rounding" / "Pembulatan" amount.
    rounding: Decimal = Decimal("0")
    # "Debit or Credit" / "Debet / Credit" amount.
    debit_credit: Decimal = Decimal("0")
    # Amount on the line that starts with "Total".
    total: Decimal = Decimal("0")
    # Non-fatal parse problems, e.g. when no line item could be parsed.
    warnings: list[str] = field(default_factory=list)


def _parse_decimal(text: str) -> Decimal:
    """Parse a number string that may use Indonesian format (dot=thousands, comma=decimal)."""
    text = text.strip().replace(" ", "")
    # Remove currency suffix like "IDR"
    text = re.sub(r'\s*IDR\s*$', '', text, flags=re.IGNORECASE)
    # Handle negative with dash at end or start
    negative = False
    if text.startswith("-") or text.startswith("("):
        negative = True
        text = text.lstrip("-(").rstrip(")")
    if text.startswith("-"):
        negative = True
        text = text[1:]

    # Determine format: if there are dots and commas, use Indonesian convention
    # If only dots, check if it could be thousands separator
    if "," in text and "." in text:
        # Indonesian: 84.827.584 or 84,827,584 — find which is thousands separator
        last_dot = text.rfind(".")
        last_comma = text.rfind(",")
        if last_comma > last_dot:
            # Dot is thousands, comma is decimal: 84.827,584
            text = text.replace(".", "").replace(",", ".")
        else:
            # Comma is thousands, dot is decimal: 84,827.584
            text = text.replace(",", "")
    elif "." in text:
        # Could be 8.000 (Indonesian thousands) or 8.5 (decimal)
        # Check pattern: if all groups of 3 after dots → thousands separator
        parts = text.split(".")
        if len(parts) > 1 and all(len(p) == 3 for p in parts[1:]):
            text = text.replace(".", "")
        # else treat dot as decimal point
    elif "," in text:
        # Multiple commas = thousands separators (e.g. 68,965,520)
        # Single comma followed by 3 digits = thousands (e.g. 2,560)
        # Single comma followed by 1-2 digits = decimal (e.g. 1,5)
        parts = text.split(",")
        if len(parts) > 1 and all(len(p) == 3 for p in parts[1:]):
            text = text.replace(",", "")   # strip thousands separators
        else:
            text = text.replace(",", ".")  # treat as decimal separator

    try:
        val = Decimal(text)
        return -val if negative else val
    except InvalidOperation:
        return Decimal("0")


def _parse_date(text: str) -> date | None:
    """Parse dates like '29 March 2026', '02 April 2026', '2026-03-30'."""
    text = text.strip()
    # ISO format: 2026-03-30
    if re.match(r'^\d{4}-\d{2}-\d{2}$', text):
        return datetime.strptime(text, "%Y-%m-%d").date()
    # English: 29 March 2026
    for fmt in ("%d %B %Y", "%d %b %Y"):
        try:
            return datetime.strptime(text, fmt).date()
        except ValueError:
            continue
    id_months = {
        "januari": 1, "februari": 2, "maret": 3, "april": 4,
        "mei": 5, "juni": 6, "juli": 7, "agustus": 8,
        "september": 9, "oktober": 10, "november": 11, "desember": 12,
    }
    en_months = {
        "january": 1, "february": 2, "march": 3, "april": 4,
        "may": 5, "june": 6, "july": 7, "august": 8,
        "september": 9, "october": 10, "november": 11, "december": 12,
        "jan": 1, "feb": 2, "mar": 3, "apr": 4,
        "jun": 6, "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
    }
    all_months = {**id_months, **en_months}

    # With spaces: "02 April 2026"
    m = re.match(r'(\d{1,2})\s+(\w+)\s+(\d{4})', text)
    if m:
        day, month_str, year = int(m.group(1)), m.group(2).lower(), int(m.group(3))
        if month_str in all_months:
            return date(year, all_months[month_str], day)

    # No spaces: "02April2026"
    m = re.match(r'(\d{1,2})([A-Za-z]+)(\d{4})', text)
    if m:
        day, month_str, year = int(m.group(1)), m.group(2).lower(), int(m.group(3))
        if month_str in all_months:
            return date(year, all_months[month_str], day)

    return None


def _extract_text(pdf_bytes: bytes) -> str:
    """Return the text of every page of the PDF, joined with newlines.

    Pages for which pdfplumber yields no text are left out.
    """
    import pdfplumber

    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        per_page = [page.extract_text() for page in document.pages]
    return "\n".join(chunk for chunk in per_page if chunk)


def _extract_tables(pdf_bytes: bytes) -> list[list[list[str]]]:
    """Return every table pdfplumber finds, in page order, as one flat list."""
    import pdfplumber

    collected = []
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        for page in document.pages:
            # A page without tables contributes nothing.
            collected.extend(page.extract_tables() or [])
    return collected


# "DD Month YYYY" token; whitespace is optional so that extracted text with
# the spaces dropped ("02April2026") still matches.
_DATE_TOKEN_RE = r'(\d{1,2}\s*[A-Za-z]+\s*\d{4})'

# ParsedPDF field name -> regexes capturing the amount that precedes "IDR".
_COST_PATTERNS = {
    "subtotal": [r'(?:Sub\s*total|Subtotal)\s+([\d.,\-]+)\s*IDR'],
    "discount": [r'(?:Discount|Diskon)\s+([\d.,\-]+)\s*IDR'],
    "ppn": [r'(?:VAT|PPN)\s+([\d.,\-]+)\s*IDR'],
    "pph22": [r'(?:Income\s*Tax|PPh\s*22)\s+([\d.,\-]+)\s*IDR'],
    "pbbkb": [r'(?:MVFT|PBBKB)\s+([\d.,\-]+)\s*IDR'],
    "rounding": [r'(?:Rounding|Pembulatan)\s+([\d.,\-]+)\s*IDR'],
    "debit_credit": [r'(?:Debit\s*(?:or|/)\s*Credit|Debet\s*/\s*Credit)\s+([\d.,\-]+)\s*IDR'],
    "total": [r'^Total\s+([\d.,\-]+)\s*IDR'],
}


def _find_booking_code(text: str) -> str:
    """Return the value after "Kode Booking" / "Booking Code", or "" if absent."""
    for label in (r'Kode\s*Booking', r'Booking\s*Code'):
        m = re.search(
            label + r'\s*[:\s]*([A-Za-z0-9][A-Za-z0-9\-/]*)',
            text, re.IGNORECASE
        )
        if m:
            code = m.group(1).strip()
            if code:
                return code
    return ""


def _find_transaction_date(text: str) -> date | None:
    """Return the date after a "Tanggal" / "Date" label, else the first parseable date."""
    for label in ("Tanggal", "Date"):
        m = re.search(
            r'(?:^|\s)' + label + r'\s*[:\s]*' + _DATE_TOKEN_RE,
            text, re.MULTILINE
        )
        if m:
            labelled = _parse_date(m.group(1).strip())
            if labelled is not None:
                return labelled
            # A labelled but unparseable date goes straight to the fallback.
            break

    # Fallback: first parseable date in the whole text
    for m in re.finditer(_DATE_TOKEN_RE, text):
        candidate = _parse_date(m.group(1))
        if candidate:
            return candidate
    return None


def _apply_cost_breakdown(result: ParsedPDF, text: str) -> None:
    """Set each cost field of *result* whose "<label> <amount> IDR" line occurs in *text*."""
    for field_name, patterns in _COST_PATTERNS.items():
        for pattern in patterns:
            m = re.search(pattern, text, re.MULTILINE | re.IGNORECASE)
            if m:
                setattr(result, field_name, _parse_decimal(m.group(1)))
                break


def _map_item_columns(table: list[list[str | None]]) -> tuple[int, dict[str, int]] | None:
    """Find the item-table header row and map logical column names to indexes.

    Returns ``(header_index, columns)``, or ``None`` when the table has no
    header containing both an item and a quantity column.
    """
    for header_idx, header_row in enumerate(table):
        # Normalise whitespace so "Q ty" / "Qty" style variants both match.
        row_norm = re.sub(r'\s+', '', " ".join((c or "") for c in header_row)).lower()
        if "item" in row_norm and ("qty" in row_norm or "quantity" in row_norm):
            break
    else:
        return None

    cols: dict[str, int] = {}
    for j, cell in enumerate(header_row):
        # Normalise: collapse whitespace, lowercase
        cell_norm = re.sub(r'\s+', '', (cell or "")).lower()
        if cell_norm.startswith("item"):
            cols["item"] = j
        elif cell_norm in ("qty", "quantity"):
            cols["qty"] = j
        elif cell_norm == "uom":
            cols["uom"] = j
        elif "no.so" in cell_norm or cell_norm == "so" or cell_norm.endswith("so"):
            cols["so"] = j
        elif "tanggalkirim" in cell_norm or "sentdate" in cell_norm:
            cols["sent_date"] = j
        elif "dikirimke" in cell_norm or "sentto" in cell_norm:
            cols["sent_to"] = j

    if "item" not in cols or "qty" not in cols:
        return None
    return header_idx, cols


def _cell_text(row: list[str | None], cols: dict[str, int], key: str) -> str:
    """Return the stripped text of the *key* column, or "" when unmapped or empty."""
    if key not in cols:
        return ""
    return (row[cols[key]] or "").strip()


def _volume_in_litres(volume_raw: Decimal, uom: str) -> Decimal:
    """Convert a parsed quantity to litres using the unit-of-measure heuristics."""
    if uom == "KL":
        # A value > 999 was read as Indonesian thousands ("8.000" -> 8000) and
        # is treated as already being litres; smaller values are true KL.
        if volume_raw > Decimal("999"):
            return volume_raw
        return volume_raw * Decimal("1000")
    # NOTE(review): this branch also runs when a non-KL unit is present, not
    # only when the unit is missing — confirm that is intended.
    if volume_raw < Decimal("100"):
        # Small number without a KL unit — assume KL
        return volume_raw * Decimal("1000")
    return volume_raw


def _parse_item_row(row: list[str | None], cols: dict[str, int]) -> ParsedItem | None:
    """Build a ParsedItem from one table row; return None for rows to skip."""
    if not row or len(row) <= max(cols.values()):
        return None

    item_cell = _cell_text(row, cols, "item")
    qty_cell = _cell_text(row, cols, "qty")
    if not item_cell or not qty_cell:
        return None
    # Skip summary rows
    if any(kw in item_cell.lower() for kw in ("subtotal", "total", "amount", "terbilang")):
        return None

    # Format: "A040900006 - PERTAMAX,BULK" or "A040900076 - PERTALITE"
    kode = ""
    product_name = item_cell
    item_match = re.match(r'(A\d+)\s*[-–]\s*(.+)', item_cell, re.DOTALL)
    if item_match:
        kode = item_match.group(1).strip()
        product_name = item_match.group(2).strip()
    # Clean up product name: remove ",BULK", newlines, extra spaces
    product_name = re.sub(r',\s*BULK', '', product_name)
    product_name = re.sub(r'\s+', ' ', product_name).strip()

    uom = _cell_text(row, cols, "uom").upper()
    date_val = _cell_text(row, cols, "sent_date")
    sent_val = _cell_text(row, cols, "sent_to")

    return ParsedItem(
        kode_item_pertamina=kode,
        product_name=product_name,
        volume_liter=_volume_in_litres(_parse_decimal(qty_cell), uom),
        no_so=_cell_text(row, cols, "so") or None,
        tanggal_kirim=_parse_date(date_val) if date_val else None,
        sent_to_text=re.sub(r'\s+', ' ', sent_val).strip() if sent_val else None,
    )


def parse_pertamina_pdf(pdf_bytes: bytes) -> ParsedPDF:
    """Parse a Pertamina transaction detail PDF and return structured data.

    Both the English and the Indonesian layout are handled by matching the
    labels of either language.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A ParsedPDF with booking code, transaction date, cost breakdown and
        line items. ``warnings`` notes when no line item could be parsed.

    Raises:
        ValueError: If the PDF has no extractable text, or the booking code
            or transaction date cannot be found.
    """
    text = _extract_text(pdf_bytes)
    if not text.strip():
        raise ValueError("PDF tidak mengandung teks yang bisa dibaca")

    booking_code = _find_booking_code(text)
    if not booking_code:
        raise ValueError("Tidak dapat menemukan Booking Code / Kode Booking di PDF")

    tanggal = _find_transaction_date(text)
    if tanggal is None:
        raise ValueError("Tidak dapat menemukan tanggal transaksi di PDF")

    result = ParsedPDF(booking_code=booking_code, tanggal=tanggal)
    _apply_cost_breakdown(result, text)

    # Extract line items from tables
    for table in _extract_tables(pdf_bytes):
        if not table or len(table) < 2:
            continue
        located = _map_item_columns(table)
        if located is None:
            continue
        header_idx, cols = located
        for row in table[header_idx + 1:]:
            item = _parse_item_row(row, cols)
            if item is not None:
                result.items.append(item)

    if not result.items:
        result.warnings.append("Tidak ada line item yang berhasil di-parse dari PDF")

    return result
