#!/usr/bin/env python3
"""Content-adaptive sprite downscaler (Kopf-Shamir-Peers 2013).

Downscales hand-drawn sprites into Genesis-compatible indexed-color pixel art
using bilateral EM kernels that respect edges and color boundaries.

Dependencies: pip install numpy scikit-image Pillow
"""

import argparse
import sys
import numpy as np
from pathlib import Path

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

GENESIS_LEVELS = np.array([0, 36, 72, 109, 145, 182, 218, 255], dtype=np.uint8)


# ---------------------------------------------------------------------------
# Color utilities
# ---------------------------------------------------------------------------

def rgb_to_lab(rgb: np.ndarray) -> np.ndarray:
    """Convert float RGB [0,1] image to CIELAB. Uses skimage."""
    from skimage.color import rgb2lab
    return rgb2lab(rgb)


def lab_to_rgb(lab: np.ndarray) -> np.ndarray:
    """Convert CIELAB image to float RGB [0,1]. Uses skimage."""
    from skimage.color import lab2rgb
    return lab2rgb(lab)


def normalize_lab(lab: np.ndarray) -> np.ndarray:
    """Normalize CIELAB to roughly [0,1] per channel (matching paper)."""
    out = np.empty_like(lab)
    out[..., 0] = lab[..., 0] / 100.0
    out[..., 1] = (lab[..., 1] + 87.0) / 186.0
    out[..., 2] = (lab[..., 2] + 108.0) / 203.0
    return out


def denormalize_lab(nlab: np.ndarray) -> np.ndarray:
    """Undo normalize_lab."""
    out = np.empty_like(nlab)
    out[..., 0] = nlab[..., 0] * 100.0
    out[..., 1] = nlab[..., 1] * 186.0 - 87.0
    out[..., 2] = nlab[..., 2] * 203.0 - 108.0
    return out


# ---------------------------------------------------------------------------
# Image I/O
# ---------------------------------------------------------------------------

def load_input(path: str, bg_color=None):
    """Load PNG, return (normalized_lab, alpha_mask)."""
    from PIL import Image

    img = Image.open(path)
    if img.mode == "RGBA":
        arr = np.array(img, dtype=np.float64)
        alpha_mask = arr[:, :, 3] > 127
        rgb01 = arr[:, :, :3] / 255.0
    elif img.mode == "RGB":
        arr = np.array(img, dtype=np.float64)
        rgb01 = arr / 255.0
        if bg_color is not None:
            bg = np.array(bg_color, dtype=np.float64) / 255.0
            alpha_mask = ~np.all(np.abs(rgb01 - bg) < 1e-3, axis=-1)
        else:
            alpha_mask = np.ones(rgb01.shape[:2], dtype=bool)
    else:
        img = img.convert("RGBA")
        arr = np.array(img, dtype=np.float64)
        alpha_mask = arr[:, :, 3] > 127
        rgb01 = arr[:, :, :3] / 255.0

    lab = rgb_to_lab(rgb01)
    nlab = normalize_lab(lab)
    return nlab, alpha_mask


def load_palette(path: str):
    """Load a 16-color palette from an indexed PNG.

    Returns (16, 3) uint8 RGB array.
    """
    from PIL import Image

    img = Image.open(path)
    if img.mode != "P":
        print(f"Error: palette image must be indexed-color (mode P), got {img.mode}",
              file=sys.stderr)
        sys.exit(1)

    raw_pal = img.getpalette()
    if raw_pal is None:
        print("Error: palette image has no palette data", file=sys.stderr)
        sys.exit(1)

    pal = np.array(raw_pal[:48], dtype=np.uint8).reshape(16, 3)
    return pal


def save_indexed_png(path: str, pixels: np.ndarray, palette: np.ndarray):
    """Write indexed-color PNG with the palette exactly as provided."""
    from PIL import Image

    img = Image.fromarray(pixels, mode="P")
    flat_pal = palette.flatten().tolist()
    flat_pal.extend([0] * (768 - len(flat_pal)))
    img.putpalette(flat_pal)
    img.save(path)


# ---------------------------------------------------------------------------
# EM core
# ---------------------------------------------------------------------------

def _invert_2x2(m):
    """Invert a single 2x2 matrix."""
    a, b, c, d = m[0, 0], m[0, 1], m[1, 0], m[1, 1]
    det = a * d - b * c
    if abs(det) < 1e-15:
        det = 1e-15
    return np.array([[d, -b], [-c, a]], dtype=np.float64) / det


def run_downscale(nlab_image, alpha_mask, out_h, out_w, n_iters=30,
                  verbose=False):
    """Run full EM downscale with proper per-pixel normalization.

    nlab_image: (H, W, 3) normalized CIELAB [0,1]
    alpha_mask: (H, W) bool

    Returns (output_nlab, output_alpha).
    """
    in_h, in_w = nlab_image.shape[:2]
    K = out_h * out_w
    N = in_h * in_w
    rx = in_w / out_w
    ry = in_h / out_h

    # Flatten input for indexing
    nlab_flat = nlab_image.reshape(N, 3)
    alpha_flat = alpha_mask.ravel()

    # Precompute all input pixel coordinates (row, col) in input space
    grid_y, grid_x = np.mgrid[:in_h, :in_w]
    coords_flat = np.stack([grid_x.ravel(), grid_y.ravel()], axis=-1).astype(np.float64)

    # --- Initialization ---

    # Output pixel centers in input coordinates
    oy, ox = np.meshgrid(
        (np.arange(out_h) + 0.5) * ry,
        (np.arange(out_w) + 0.5) * rx,
        indexing="ij",
    )
    centers = np.stack([ox.ravel(), oy.ravel()], axis=-1)  # (K, 2)

    mu = centers.copy()
    # Covariance: diag(rx/3, ry/3) — NOT squared, matching paper
    Sigma = np.zeros((K, 2, 2), dtype=np.float64)
    Sigma[:, 0, 0] = rx / 3.0
    Sigma[:, 1, 1] = ry / 3.0

    # Color mean: local average from input
    nu = np.full((K, 3), 0.5, dtype=np.float64)
    for k in range(K):
        cx, cy = centers[k]
        x0 = max(0, int(cx - rx / 2))
        x1 = min(in_w, int(cx + rx / 2) + 1)
        y0 = max(0, int(cy - ry / 2))
        y1 = min(in_h, int(cy + ry / 2) + 1)
        patch = nlab_image[y0:y1, x0:x1]
        mask_patch = alpha_mask[y0:y1, x0:x1]
        if mask_patch.any():
            nu[k] = patch[mask_patch].mean(axis=0)

    # Color variance (scalar), matching paper init
    sigma_c = np.full(K, 0.0001, dtype=np.float64)

    # Precompute R(k): set of input pixel indices within 2*rx of each kernel
    R = [None] * K
    for k in range(K):
        cx, cy = centers[k]
        x0 = max(0, int(cx - 2 * rx))
        x1 = min(in_w, int(cx + 2 * rx))
        y0 = max(0, int(cy - 2 * ry))
        y1 = min(in_h, int(cy + 2 * ry))
        yy, xx = np.mgrid[y0:y1, x0:x1]
        R[k] = (yy.ravel() * in_w + xx.ravel()).astype(np.int32)

    # Eigenvalue clamp bounds, scaled with downscale ratio
    # For rx=2 matches paper's [0.05, 0.1]; scales quadratically
    ev_min = 0.0125 * rx * rx
    ev_max = 0.025 * rx * rx

    # --- EM iterations ---
    prev_nu = None

    for it in range(n_iters):
        # ===================== E-STEP =====================
        # Compute bilateral weights w_k(i) and per-pixel sums for normalization

        # Per-pixel accumulator: sum of w_k(i) across all kernels k
        pixel_wsum = np.zeros(N, dtype=np.float64)

        # Store per-kernel weights (sparse: only pixels in R(k))
        kernel_w = [None] * K

        for k in range(K):
            idx = R[k]
            pi = coords_flat[idx]         # (M, 2)
            ci = nlab_flat[idx]           # (M, 3)
            ai = alpha_flat[idx]          # (M,) bool

            # Spatial Gaussian: Mahalanobis distance
            diff_s = pi - mu[k]
            Si = _invert_2x2(Sigma[k])
            mahal = np.sum(diff_s @ Si * diff_s, axis=-1)
            f_k = np.exp(-0.5 * mahal)

            # Color Gaussian
            diff_c = ci - nu[k]
            color_dist_sq = np.sum(diff_c ** 2, axis=-1)
            sc2 = max(sigma_c[k] * sigma_c[k], 1e-15)
            g_k = np.exp(-color_dist_sq / (2.0 * sc2))

            # Bilateral weight
            w = f_k * g_k
            w[~ai] = 0.0

            # Per-kernel normalization (numerical stability)
            wsum = w.sum()
            if wsum > 0:
                w /= wsum

            kernel_w[k] = w
            pixel_wsum[idx] += w

        # ===================== COMPUTE GAMMA & M-STEP =====================
        # gamma_k(i) = w_k(i) / sum_n w_n(i)  [per-pixel normalization]

        new_mu = np.zeros((K, 2), dtype=np.float64)
        new_Sigma = np.zeros((K, 2, 2), dtype=np.float64)
        new_nu = np.zeros((K, 3), dtype=np.float64)

        # Also store gamma for shape constraint overlap check
        kernel_gamma = [None] * K

        for k in range(K):
            idx = R[k]
            w = kernel_w[k]
            pi = coords_flat[idx]
            ci = nlab_flat[idx]

            # Per-pixel normalization
            denom = pixel_wsum[idx]
            denom = np.where(denom > 0, denom, 1.0)
            gamma = w / denom

            kernel_gamma[k] = gamma

            gamma_sum = gamma.sum()
            if gamma_sum < 1e-12:
                new_mu[k] = centers[k]
                new_Sigma[k] = Sigma[k]
                new_nu[k] = nu[k]
                continue

            # M-step
            new_mu[k] = (gamma[:, None] * pi).sum(axis=0) / gamma_sum
            diff_s = pi - new_mu[k]
            new_Sigma[k] = (gamma[:, None, None] *
                            (diff_s[:, :, None] * diff_s[:, None, :])).sum(axis=0) / gamma_sum
            new_nu[k] = (gamma[:, None] * ci).sum(axis=0) / gamma_sum

        mu = new_mu
        Sigma = new_Sigma
        nu = new_nu

        # ===================== CORRECTION STEP =====================

        # 1. Spatial bias: blend mu toward 4-neighbor average, clamp to box
        mu_grid = mu.reshape(out_h, out_w, 2)
        neighbor_sum = np.zeros_like(mu_grid)
        neighbor_cnt = np.zeros((out_h, out_w, 1), dtype=np.float64)

        neighbor_sum[1:, :] += mu_grid[:-1, :]
        neighbor_cnt[1:, :] += 1
        neighbor_sum[:-1, :] += mu_grid[1:, :]
        neighbor_cnt[:-1, :] += 1
        neighbor_sum[:, 1:] += mu_grid[:, :-1]
        neighbor_cnt[:, 1:] += 1
        neighbor_sum[:, :-1] += mu_grid[:, 1:]
        neighbor_cnt[:, :-1] += 1

        neighbor_cnt = np.maximum(neighbor_cnt, 1)
        mu_bar = neighbor_sum / neighbor_cnt

        mu_grid = 0.5 * mu_grid + 0.5 * mu_bar

        centers_grid = centers.reshape(out_h, out_w, 2)
        mu_grid[..., 0] = np.clip(mu_grid[..., 0],
                                   centers_grid[..., 0] - rx / 4.0,
                                   centers_grid[..., 0] + rx / 4.0)
        mu_grid[..., 1] = np.clip(mu_grid[..., 1],
                                   centers_grid[..., 1] - ry / 4.0,
                                   centers_grid[..., 1] + ry / 4.0)
        mu = mu_grid.reshape(K, 2)

        # 2. Constrain spatial covariance via SVD eigenvalue clamping
        for k in range(K):
            U, s, Vt = np.linalg.svd(Sigma[k])
            s = np.clip(s, ev_min, ev_max)
            Sigma[k] = U @ np.diag(s) @ Vt

        # 3. Shape constraint: check directional variance AND kernel overlap
        ky_all = np.arange(K) // out_w
        kx_all = np.arange(K) % out_w

        for k in range(K):
            ky, kx = ky_all[k], kx_all[k]
            gamma_k = kernel_gamma[k]
            idx_k = R[k]

            for dy in range(-1, 2):
                for dx in range(-1, 2):
                    if dy == 0 and dx == 0:
                        continue
                    ny, nx = ky + dy, kx + dx
                    if not (0 <= ny < out_h and 0 <= nx < out_w):
                        continue
                    nk = ny * out_w + nx

                    # Directional variance check
                    d = mu[nk] - mu[k]
                    d_norm = np.linalg.norm(d)
                    if d_norm > 1e-8:
                        d_hat = d / d_norm
                        # Weighted directional variance
                        pi = coords_flat[idx_k]
                        proj = np.maximum(0, (pi - mu[k]) @ d_hat)
                        s = (gamma_k * proj * proj).sum()
                    else:
                        s = 0.0

                    # Kernel overlap check
                    # Find common pixels between R(k) and R(nk)
                    idx_n = R[nk]
                    gamma_n = kernel_gamma[nk]
                    common, k_pos, n_pos = np.intersect1d(idx_k, idx_n,
                                                          return_indices=True)
                    if len(common) > 0:
                        f = (gamma_k[k_pos] * gamma_n[n_pos]).sum()
                    else:
                        f = 0.0

                    if s > 0.2 * rx or f < 0.08:
                        sigma_c[k] *= 1.1
                        sigma_c[nk] *= 1.1

        # Convergence check (after 30 iterations, check color stability)
        if prev_nu is not None and it >= 30:
            max_delta = np.max(np.abs(nu - prev_nu))
            if verbose:
                print(f"  iter {it+1}/{n_iters}: max color delta = {max_delta:.6f}")
            if max_delta < 0.002:
                if verbose:
                    print(f"  Converged at iteration {it+1}")
                break
        elif verbose:
            mu_shift = np.linalg.norm(mu - centers, axis=-1).mean()
            print(f"  iter {it+1}/{n_iters}: mean spatial shift = {mu_shift:.4f} px")

        prev_nu = nu.copy()

    # --- Build output ---
    output_nlab = nu.reshape(out_h, out_w, 3)

    # Transparency: check opaque fraction per cell
    output_alpha = np.ones((out_h, out_w), dtype=bool)
    for k in range(K):
        ky, kx = ky_all[k], kx_all[k]
        cx, cy = centers[k]
        x0 = max(0, int(cx - rx / 2))
        x1 = min(in_w, int(cx + rx / 2) + 1)
        y0 = max(0, int(cy - ry / 2))
        y1 = min(in_h, int(cy + ry / 2) + 1)
        patch_alpha = alpha_mask[y0:y1, x0:x1]
        total = patch_alpha.size
        opaque = patch_alpha.sum()
        if total > 0 and (opaque / total) < 0.3:
            output_alpha[ky, kx] = False

    return output_nlab, output_alpha


# ---------------------------------------------------------------------------
# Palette assignment
# ---------------------------------------------------------------------------

def assign_to_palette(output_nlab, output_alpha, palette_rgb):
    """Assign each output pixel to the nearest color in the provided palette.

    Matches in CIELAB space. Palette is used exactly as-is.
    """
    H, W = output_nlab.shape[:2]

    # Convert palette to normalized CIELAB for comparison
    pal_rgb01 = palette_rgb.astype(np.float64) / 255.0
    pal_lab = rgb_to_lab(pal_rgb01.reshape(1, -1, 3)).reshape(16, 3)
    pal_nlab = normalize_lab(pal_lab)

    # Denormalize output for comparison in same space
    indices = np.zeros((H, W), dtype=np.uint8)
    for y in range(H):
        for x in range(W):
            if not output_alpha[y, x]:
                indices[y, x] = 0
            else:
                d = np.sum((output_nlab[y, x] - pal_nlab) ** 2, axis=-1)
                indices[y, x] = np.argmin(d)

    return indices


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def parse_args(argv=None):
    p = argparse.ArgumentParser(
        description="Content-adaptive sprite downscaler for Genesis pixel art.",
    )
    p.add_argument("input", help="Input PNG path")
    p.add_argument("output", help="Output indexed PNG path")
    p.add_argument("--size", required=True,
                   help="Output size WxH (multiples of 8)")
    p.add_argument("--palette", required=True,
                   help="Indexed PNG to use as the 16-color palette")
    p.add_argument("--iters", type=int, default=50,
                   help="Max EM iterations (default 50, converges early)")
    p.add_argument("--bg-color",
                   help="Treat this R,G,B color as transparent (for RGB inputs)")
    p.add_argument("--seed", type=int, default=None,
                   help="RNG seed for reproducibility")
    p.add_argument("--verbose", action="store_true",
                   help="Print per-iteration convergence stats")
    return p.parse_args(argv)


def main(argv=None):
    args = parse_args(argv)

    # Parse size
    try:
        w_str, h_str = args.size.lower().split("x")
        out_w, out_h = int(w_str), int(h_str)
    except ValueError:
        print(f"Error: --size must be WxH, got '{args.size}'", file=sys.stderr)
        sys.exit(1)

    if out_w % 8 != 0 or out_h % 8 != 0:
        print("Error: output dimensions must be multiples of 8", file=sys.stderr)
        sys.exit(1)

    # Load palette
    palette_path = Path(args.palette)
    if not palette_path.exists():
        print(f"Error: palette file not found: {args.palette}", file=sys.stderr)
        sys.exit(1)
    ext_palette = load_palette(args.palette)

    # Parse bg color
    bg_color = None
    if args.bg_color:
        try:
            parts = [int(x.strip()) for x in args.bg_color.split(",")]
            if len(parts) != 3:
                raise ValueError
            bg_color = tuple(parts)
        except ValueError:
            print("Error: --bg-color must be R,G,B", file=sys.stderr)
            sys.exit(1)

    # Load input
    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: input file not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    print(f"Loading {args.input}...")
    nlab_image, alpha_mask = load_input(args.input, bg_color)
    in_h, in_w = nlab_image.shape[:2]

    if out_w >= in_w or out_h >= in_h:
        print(f"Error: output ({out_w}x{out_h}) must be smaller than input ({in_w}x{in_h})",
              file=sys.stderr)
        sys.exit(1)

    print(f"Input:  {in_w}x{in_h}")
    print(f"Output: {out_w}x{out_h}")
    print(f"Palette: {args.palette}")
    print(f"Max EM iterations: {args.iters}")

    # Run EM downscale
    print("Running EM downscale...")
    output_nlab, output_alpha = run_downscale(
        nlab_image, alpha_mask, out_h, out_w,
        n_iters=args.iters, verbose=args.verbose,
    )

    # Assign to provided palette
    print("Assigning to palette...")
    indices = assign_to_palette(output_nlab, output_alpha, ext_palette)

    # Save
    print(f"Saving {args.output}...")
    save_indexed_png(args.output, indices, ext_palette)

    # Summary
    n_opaque = output_alpha.sum()
    n_transparent = (~output_alpha).sum()
    colors_used = len(np.unique(indices[output_alpha])) if n_opaque > 0 else 0
    print(f"\nDone!")
    print(f"  Opaque pixels:      {n_opaque}")
    print(f"  Transparent pixels: {n_transparent}")
    print(f"  Colors used:        {colors_used} / 15")


if __name__ == "__main__":
    main()