# Source code for interpreter.intent

from __future__ import annotations

"""Intent helpers for the TDPy interpreter package.

This module is the intention layer for converting parsed text into solver-ready
pieces.

Responsibilities
----------------
The module intentionally stays narrow:

* Normalize equation strings from EES-like syntax into solver-friendly syntax.
* Parse constant assignments from ``given`` sections.
* Parse guess lines in several human-friendly formats.
* Extract identifiers conservatively for unknown inference.
* Resolve numeric constants through safe multi-pass evaluation.

Design notes
------------
This module does not parse full text files. Full text parsing belongs in
``parse.py``. This module provides primitives used by ``build_spec.py``.

Thermodynamic-property calls are handled carefully. The identifier inference
logic treats common CoolProp and Cantera function names as functions rather than
unknown variables. It also treats common fluid-spec tokens as symbolic text so
numeric constant resolution does not emit unnecessary warnings for inputs such
as ``fluid = Helium`` or ``spec = gri30.yaml|X=CH4:1``.
"""

import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple

from .numeric_eval import NumericEvalError, safe_eval_numeric, try_parse_float_or_quantity

# Reuse solver-safe preprocessing when available.
try:
    # Shared implementation; used whenever the module imports cleanly.
    from equations.safe_eval import preprocess_expr  # type: ignore
except Exception:  # pragma: no cover
    # Any failure raised while importing the shared helper (not only a missing
    # module) lands here, and the minimal local definition below is used.

    def preprocess_expr(s: str) -> str:
        """Minimal fallback preprocessing used when equations.safe_eval is unavailable.

        The fallback only rewrites the caret operator ``^`` as ``**``; it
        performs no other normalization.
        """
        return s.replace("^", "**")


# ------------------------------ regex ------------------------------

# Assignment form: name [=|==|:=] rhs
# Groups: 1 = identifier, 2 = operator, 3 = non-empty RHS with surrounding
# whitespace excluded.
_ASSIGN_RE = re.compile(r"^\s*([A-Za-z_]\w*)\s*(==|=|:=)\s*(.+?)\s*$")

# Guess forms such as "? x = 1", "guess: x = 1", and "init x = 1".
# Groups: 1 = marker ("?", "guess" or "init", case-insensitive), 2 = the rest
# of the line. At least one colon or whitespace character must separate the
# marker from the rest, so identifiers like "guess_x" do not match.
_GUESS_PREFIX_RE = re.compile(r"^\s*(\?|guess|init)\s*[:\s]+(.+)$", re.IGNORECASE)

# Inline guess form: "x ?= 1".
# Groups: 1 = identifier, 2 = value text.
_GUESS_INLINE_RE = re.compile(r"^\s*([A-Za-z_]\w*)\s*\?=\s*(.+?)\s*$")

# Defensive directive filters. Both require a colon and a non-empty payload
# after it ("report: x, y", "solve: ..." / "solver: ...").
_REPORT_LINE_RE = re.compile(r"^\s*report\s*:\s*(.+?)\s*$", re.IGNORECASE)
_SOLVE_LINE_RE = re.compile(r"^\s*(solve|solver)\s*:\s*(.+?)\s*$", re.IGNORECASE)

# Optimizer directive-like lines. These are not equations; they are interpreted
# by build_spec.py when optimization mode is enabled. The RHS is optional so
# section-header forms such as "objective:" do not become equations.
# In each pattern group 1 is the keyword and group 2 is the (possibly empty)
# payload after the colon.
_OBJECTIVE_LINE_RE = re.compile(r"^\s*(objective|minimize|maximize)\s*:\s*(.*?)\s*$", re.IGNORECASE)
_CONSTRAINTS_LINE_RE = re.compile(r"^\s*(constraints|constraint)\s*:\s*(.*?)\s*$", re.IGNORECASE)
_DESIGNVARS_LINE_RE = re.compile(
    r"^\s*(design_vars|designvars|design_variables|designvariables)\s*:\s*(.*?)\s*$",
    re.IGNORECASE,
)
_BOUNDS_LINE_RE = re.compile(r"^\s*(bounds|bound)\s*:\s*(.*?)\s*$", re.IGNORECASE)


def _is_directive_line(line: str) -> bool:
    """Tell whether ``line`` is a recognised directive instead of solver content.

    Inline comments and surrounding whitespace are removed first. A blank
    result is never a directive. Otherwise the line is a directive when it
    matches any of the report, solve, objective, constraints, design-variable
    or bounds patterns.
    """
    text = _strip_inline_comment(line).strip()
    if not text:
        return False

    directive_patterns = (
        _REPORT_LINE_RE,
        _SOLVE_LINE_RE,
        _OBJECTIVE_LINE_RE,
        _CONSTRAINTS_LINE_RE,
        _DESIGNVARS_LINE_RE,
        _BOUNDS_LINE_RE,
    )
    return any(pattern.match(text) is not None for pattern in directive_patterns)


# Lightweight function-call detector for name extraction heuristics.
# Captures an identifier that is followed (after optional whitespace) by an
# opening parenthesis; group 1 is the called name.
_CALL_RE = re.compile(r"\b([A-Za-z_]\w*)\s*\(")


# ------------------------------ vocab ------------------------------

# Constant names that identifier extraction never reports as variables.
# Matching is done case-insensitively through _BUILTIN_CONSTS_LC.
_BUILTIN_CONSTS: Set[str] = {"pi", "e"}

# Function names that should not be treated as variables. Keep aligned with the
# equations.safe_eval allowlist and interpreter.build_spec.
#
# NOTE: extract_names_fallback compares names against the lowercased view of
# this set and drops every match, whether or not the name is followed by "(".
# A variable spelled like one of these entries (for example "gamma" or "min")
# is therefore never reported as an identifier.
_MATH_FUNC_NAMES: Set[str] = {
    # core
    "abs",
    "min",
    "max",
    "pow",
    "clamp",
    # exp/logs
    "sqrt",
    "exp",
    "log",
    "ln",
    "log10",
    "log2",
    # trig
    "sin",
    "cos",
    "tan",
    "asin",
    "acos",
    "atan",
    "atan2",
    # hyperbolic
    "sinh",
    "cosh",
    "tanh",
    # rounding
    "floor",
    "ceil",
    # misc
    "hypot",
    "radians",
    "degrees",
    # thermo (property-call entry points and their availability/cache helpers)
    "PropsSI",
    "PhaseSI",
    "HAPropsSI",
    "CTPropsSI",
    "CTPropsMulti",
    "CTBatchProps",
    "ctprops_si",
    "ctprops_multi",
    "batch_ctprops",
    "cantera_available",
    "ctprops_cache_info",
    "clear_ctprops_caches",
    "ASPropsSI",
    "ASPropsMulti",
    "ASBatchProps",
    "as_props_si",
    "as_props_multi",
    "batch_as_props",
    "abstractstate_available",
    "FugacitySI",
    "FugacityCoeffSI",
    "LnFugacityCoeffSI",
    "ChemicalPotentialSI",
    "LiBrPropsSI",
    "LiBrH2OPropsSI",
    "LiBrPropsMulti",
    "LiBrBatchProps",
    "librh2o_props_si",
    "librh2o_props_multi",
    "batch_librh2o_props",
    "NH3H2O",
    "NH3H2O_STATE",
    "NH3H2O_TPX",
    "NH3H2O_STATE_TPX",
    "NH3H2OPropsSI",
    "NH3H2OPropsMulti",
    "NH3H2OBatchProps",
    "nh3h2o_available",
    "state_tpx",
    "prop_tpx",
    "props_multi_tpx",
    "batch_prop_tpx",
    # special
    "erf",
    "erfc",
    "gamma",
    "lgamma",
}

# Words that users might type that should never become unknowns.
# These are section and directive keywords; identifier extraction filters
# them case-insensitively through _RESERVED_WORDS_LC.
_RESERVED_WORDS: Set[str] = {
    "title",
    "given",
    "givens",
    "constants",
    "const",
    "params",
    "parameters",
    "guess",
    "guesses",
    "init",
    "inits",
    "variables",
    "vars",
    "equations",
    "eqs",
    "report",
    "output",
    "solve",
    "solver",
    "objective",
    "minimize",
    "maximize",
    "constraints",
    "constraint",
    "design_vars",
    "designvars",
    "design_variables",
    "designvariables",
    "bounds",
    "bound",
    "note",
    "notes",
    "units",
}

# Lowercased views for case-insensitive filtering.
# They are computed once at import time, so later mutation of the source sets
# is not reflected here.
_BUILTIN_CONSTS_LC = {s.lower() for s in _BUILTIN_CONSTS}
_RESERVED_WORDS_LC = {s.lower() for s in _RESERVED_WORDS}
_MATH_FUNC_NAMES_LC = {s.lower() for s in _MATH_FUNC_NAMES}


# ------------------------------ helpers ------------------------------

def _strip_string_literals(expr: str) -> str:
    """Remove content inside quoted strings before identifier extraction."""
    out: list[str] = []
    q: str | None = None
    esc = False

    for ch in expr:
        if q is None:
            if ch in ("'", '"'):
                q = ch
                out.append(" ")
            else:
                out.append(ch)
        else:
            if esc:
                esc = False
                continue
            if ch == "\\":
                esc = True
                continue
            if ch == q:
                q = None
                out.append(" ")
            else:
                continue

    return "".join(out)


def _strip_inline_comment(line: str) -> str:
    """Strip ``#`` and ``//`` inline comments while respecting quoted strings."""
    s = line
    out: list[str] = []
    q: str | None = None
    esc = False
    i = 0
    n = len(s)

    while i < n:
        ch = s[i]

        if q is None:
            if ch in ("'", '"'):
                q = ch
                out.append(ch)
                i += 1
                continue

            if ch == "#":
                break
            if ch == "/" and i + 1 < n and s[i + 1] == "/":
                break

            out.append(ch)
            i += 1
            continue

        # inside quotes
        if esc:
            esc = False
            out.append(ch)
            i += 1
            continue
        if ch == "\\":
            esc = True
            out.append(ch)
            i += 1
            continue
        if ch == q:
            q = None
            out.append(ch)
            i += 1
            continue

        out.append(ch)
        i += 1

    return "".join(out)


# This token regex is used only to suppress numeric-eval warnings for
# fluid/mechanism specs. It does not affect solver parsing.
_SYMBOL_TOKEN_RE = re.compile(r"^[A-Za-z0-9_:\-./|=,+\[\]\(\)]+$")


def _is_symbolic_constant_rhs(rhs: str) -> bool:
    """Return whether an RHS looks like a symbolic text constant."""
    r = rhs.strip()
    if not r:
        return False

    if (len(r) >= 2) and ((r[0] == r[-1]) and r[0] in ("'", '"')):
        return True

    if any(ch.isspace() for ch in r):
        return False

    if any(op in r for op in ("+", "*", "/", "^")):
        if "*" in r or "/" in r or "^" in r:
            return False

    if not _SYMBOL_TOKEN_RE.fullmatch(r):
        return False

    return any(ch.isalpha() for ch in r)


# ------------------------------ data model ------------------------------


[docs]
@dataclass
class IntentDraft:
    """Optional richer structure for future one-shot text interpretation.

    The current interpreter path uses ``build_spec.py`` as the main builder.
    This dataclass is retained as a stable intermediate shape for future
    extensions and for callers that may already import it.

    The first eight fields have no defaults and must be supplied by the
    caller. ``reserved_words`` and ``func_names`` default to newly built sets
    derived from the module vocabularies, so mutating them on one instance
    does not touch the module-level sets or other instances.
    """

    # NOTE(review): the roles of the required fields are suggested only by
    # their names and types — confirm against the code that builds drafts.
    title: Optional[str]
    equations: List[str]
    constants_expr: Dict[str, str]
    constants_val: Dict[str, float]
    guesses: Dict[str, float]
    report: List[str]
    solve_overrides: Dict[str, Any]
    warnings: List[str]
    # Default: union of _RESERVED_WORDS and _BUILTIN_CONSTS (a new set per instance).
    reserved_words: Set[str] = field(default_factory=lambda: set(_RESERVED_WORDS) | set(_BUILTIN_CONSTS))
    # Default: a copy of _MATH_FUNC_NAMES (a new set per instance).
    func_names: Set[str] = field(default_factory=lambda: set(_MATH_FUNC_NAMES))



# ------------------------------ parsing primitives ------------------------------


[docs]
def parse_guess_line(line: str, *, enable_units: bool = True) -> Optional[Tuple[str, float]]:
    """Interpret one line as an initial guess for a variable.

    Recognised spellings, tried in this order:

    * inline: ``"x ?= 1.2"``
    * prefixed: ``"? x = 1.2"``, ``"guess: x = 1.2"``, ``"init x = 1.2"``,
      and the operator-less ``"guess x 1.2"``
    * plain assignment: ``"x = 1.2"``

    In every case the value text must be accepted by
    ``try_parse_float_or_quantity``; ``enable_units`` is forwarded to it.
    Blank lines, comment-only lines and directive lines are rejected.

    Returns
    -------
    tuple[str, float] | None
        ``(name, value)`` on success, ``None`` when the line is not a guess.
    """
    text = _strip_inline_comment(line).strip()
    if not text or _is_directive_line(text):
        return None

    def _as_guess(var: str, raw_value: str) -> Optional[Tuple[str, float]]:
        # Shared tail of every branch: numeric/quantity parse, then pair up.
        number = try_parse_float_or_quantity(raw_value, enable_units=enable_units)
        return None if number is None else (var, float(number))

    inline = _GUESS_INLINE_RE.match(text)
    if inline is not None:
        return _as_guess(inline.group(1), inline.group(2).strip())

    prefixed = _GUESS_PREFIX_RE.match(text)
    if prefixed is not None:
        remainder = prefixed.group(2).strip()
        assigned = _ASSIGN_RE.match(remainder)
        if assigned is not None:
            return _as_guess(assigned.group(1), assigned.group(3).strip())

        # No assignment operator: accept "<name> <value ...>".
        tokens = remainder.split()
        if len(tokens) < 2:
            return None
        var, raw_value = tokens[0], " ".join(tokens[1:])
        number = try_parse_float_or_quantity(raw_value, enable_units=enable_units)
        if number is None or not re.fullmatch(r"[A-Za-z_]\w*", var):
            return None
        return var, float(number)

    plain = _ASSIGN_RE.match(text)
    if plain is None:
        return None
    return _as_guess(plain.group(1), plain.group(3).strip())




[docs]
def parse_constant_assignment(line: str) -> Optional[Tuple[str, str]]:
    """Split a constant assignment into its name and preprocessed RHS.

    Lines such as ``"g = 9.81"`` and ``"A := pi*r^2"`` are accepted. The RHS
    is trimmed and passed through ``preprocess_expr`` before being returned.

    Whether the RHS is numeric or symbolic is not judged here; the builder
    layer decides that.

    Returns
    -------
    tuple[str, str] | None
        ``(name, rhs)``, or ``None`` for blank lines, comment-only lines,
        directive lines, and anything that is not an assignment.
    """
    text = _strip_inline_comment(line).strip()
    if not text or _is_directive_line(text):
        return None

    assignment = _ASSIGN_RE.match(text)
    if assignment is None:
        return None

    name, _operator, raw_rhs = assignment.groups()
    return name, preprocess_expr(raw_rhs.strip())




[docs]
def looks_like_equation(line: str) -> bool:
    """Decide whether a line belongs in the equation stream.

    ``False`` is returned for empty lines, comment-only lines, and
    ``report`` / ``solve`` directives. Every other line yields ``True``.

    Optimizer directives (``maximize:``, ``bounds:``, ``design_vars:`` and
    the like) are deliberately let through so that ``build_spec.py`` can
    peel them off and build optimization specs.
    """
    text = _strip_inline_comment(line).strip()
    if not text:
        return False

    filtered_out = any(
        pattern.match(text) is not None
        for pattern in (_REPORT_LINE_RE, _SOLVE_LINE_RE)
    )
    return not filtered_out




[docs]
def normalize_equation(line: str) -> str:
    """Rewrite common user syntax into the form the solver consumes.

    The inline comment is removed, the text is run through
    ``preprocess_expr``, and the ``:=`` and ``==`` spellings are then
    replaced by a plain ``=``. A blank or comment-only line gives ``""``.
    """
    text = _strip_inline_comment(line).strip()
    if not text:
        return ""

    normalized = preprocess_expr(text)
    # Order matters: ":=" is rewritten before "==".
    for alias in (":=", "=="):
        normalized = normalized.replace(alias, "=")

    return normalized.strip()



# ------------------------------ identifier extraction ------------------------------


[docs]
# Decimal or scientific-notation literal ("3", "2.5", ".5", "1e5", "2.5E-3").
# The lookbehind keeps digits that belong to an identifier ("T1", "x2e3") or
# that follow a dot from being treated as the start of a literal.
_NUMERIC_LITERAL_RE = re.compile(r"(?<![\w.])(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?")


def extract_names_fallback(expr: str) -> Set[str]:
    """Extract likely variable names from an expression.

    Comments, quoted string literals and numeric literals are removed before
    scanning, so the exponent part of a number such as ``1e5`` or ``2.5E10``
    is not mistaken for an identifier (``e5``, ``E10``).

    Parameters
    ----------
    expr:
        Expression or equation text.

    Returns
    -------
    set[str]
        Identifiers found in ``expr``, excluding (case-insensitively)
        built-in constants, reserved directive words and known function
        names, and excluding names that start with a double underscore.
        Known function names are excluded whether or not they are called.
    """
    text = _strip_string_literals(_strip_inline_comment(expr))
    # Replace each literal with a space so neighbouring tokens stay separate.
    text = _NUMERIC_LITERAL_RE.sub(" ", text)

    names: Set[str] = set()
    for candidate in re.findall(r"[A-Za-z_]\w*", text):
        if candidate.startswith("__"):
            continue
        lowered = candidate.lower()
        if (
            lowered in _BUILTIN_CONSTS_LC
            or lowered in _RESERVED_WORDS_LC
            or lowered in _MATH_FUNC_NAMES_LC
        ):
            continue
        names.add(candidate)
    return names



# ------------------------------ constant resolution ------------------------------


[docs]
def resolve_constants(
    constants_expr: Mapping[str, str],
    *,
    enable_units: bool = True,
) -> Tuple[Dict[str, float], Dict[str, str], List[str]]:
    """Resolve numeric constants with a multi-pass strategy.

    Resolution order is:

    1. Direct numeric or unit-aware parsing.
    2. Safe numeric evaluation using constants that have already been
       resolved, repeated until a pass makes no progress.
    3. Preservation of unresolved expressions for the caller.

    Right-hand sides that look like symbolic text (see
    ``_is_symbolic_constant_rhs``) are never evaluated and never produce
    warnings; they are returned in the unresolved mapping.

    Parameters
    ----------
    constants_expr:
        Mapping of constant name to RHS text. Each RHS is trimmed and run
        through ``preprocess_expr`` before anything else.
    enable_units:
        Forwarded to ``try_parse_float_or_quantity`` for the direct-parse
        step.

    Returns
    -------
    tuple[dict[str, float], dict[str, str], list[str]]
        Resolved numeric constants, unresolved expression constants, and
        warning messages. A constant that is still unresolved gets at most
        one "Failed to evaluate" warning (the latest failure), and a
        constant that does resolve leaves no such warning behind.
    """
    warnings: List[str] = []
    resolved: Dict[str, float] = {}
    unresolved: Dict[str, str] = {
        name: preprocess_expr(text.strip()) for name, text in dict(constants_expr).items()
    }

    # Step 1: plain numbers and quantities need no evaluation context.
    for name, rhs in list(unresolved.items()):
        literal = try_parse_float_or_quantity(rhs, enable_units=enable_units)
        if literal is not None:
            resolved[name] = float(literal)
            del unresolved[name]

    # Only non-symbolic expressions are candidates for evaluation. Whether an
    # RHS is symbolic does not change between passes, so decide it once.
    pending: Dict[str, str] = {
        name: rhs for name, rhs in unresolved.items() if not _is_symbolic_constant_rhs(rhs)
    }
    # Latest unexpected failure per constant; reported only if it never resolves.
    eval_failures: Dict[str, str] = {}

    # Step 2: every productive pass resolves at least one constant, so
    # len(pending) passes always suffice, however the definitions are ordered.
    for _ in range(len(pending)):
        progressed = False
        for name, rhs in list(pending.items()):
            try:
                value = safe_eval_numeric(rhs, names=resolved)
            except NumericEvalError:
                # Expected while dependencies are still unresolved; retry later.
                continue
            except Exception as exc:
                # Deliberately broad: an unexpected evaluator error must not
                # abort resolution of the remaining constants.
                eval_failures[name] = f"Failed to evaluate constant {name}={rhs!r}: {exc}"
                continue

            resolved[name] = float(value)
            del pending[name]
            del unresolved[name]
            eval_failures.pop(name, None)
            progressed = True

        if not progressed:
            break

    warnings.extend(eval_failures.values())

    # Step 3: explain what is still missing for each leftover expression.
    for name, rhs in pending.items():
        missing = {
            ref
            for ref in extract_names_fallback(rhs)
            if ref not in resolved and ref.lower() not in _BUILTIN_CONSTS_LC
        }
        if missing:
            warnings.append(
                f"Constant {name!r} could not be resolved; references unknown names: {sorted(missing)}"
            )

    return resolved, unresolved, warnings