# -*- coding: utf-8 -*-
"""Small LLM output helpers."""

import json
import re
from typing import Any, Dict, Optional

# First Markdown code fence in a response, with an optional "json" language tag.
_FENCED_JSON_RE = re.compile(r"```(?:json)?\s*([\s\S]*?)\s*```", re.IGNORECASE)

# Regex fallback: extract the "answer" value from a JSON-like structure.
# Matches a double-quoted value, including values that span multiple lines
# and values containing backslash escapes.
_ANSWER_FIELD_RE = re.compile(
    r'"answer"\s*:\s*"((?:[^"\\]|\\.)*)"',
    re.DOTALL,
)

# Short JSON escapes for the control characters models most often emit raw.
_CONTROL_ESCAPES = {"\n": "\\n", "\r": "\\r", "\t": "\\t"}


def extract_json_object(text: str) -> Dict[str, Any]:
    """Extract a JSON object from a model response.

    Strategies are tried in order:

    1. If the response contains a Markdown code fence, only the content of
       the first fence is considered from then on.
    2. Parse the (possibly unfenced) text as JSON.
    3. Parse the span from the first ``{`` to the last ``}``.
    4. Parse that span again after escaping raw control characters found
       inside string literals.

    Args:
        text: Raw model output. May be empty.

    Returns:
        The decoded object. An empty dict is returned when ``text`` is
        empty, when nothing parses, or when the first successful parse
        yields a non-object value (list, string, number, ...).
    """
    if not text:
        return {}

    stripped = text.strip()
    fenced_match = _FENCED_JSON_RE.search(stripped)
    if fenced_match:
        stripped = fenced_match.group(1).strip()

    try:
        value = json.loads(stripped)
        return value if isinstance(value, dict) else {}
    except json.JSONDecodeError:
        pass

    # Fall back to the outermost brace-delimited span, which tolerates
    # prose before and after the object.
    start = stripped.find("{")
    end = stripped.rfind("}")
    if start >= 0 and end > start:
        fragment = stripped[start:end + 1]
        try:
            value = json.loads(fragment)
            return value if isinstance(value, dict) else {}
        except json.JSONDecodeError:
            # Retry with control characters escaped (common when the model
            # emits literal newlines/tabs inside string values).
            repaired = _repair_control_chars(fragment)
            if repaired != fragment:
                try:
                    value = json.loads(repaired)
                    return value if isinstance(value, dict) else {}
                except json.JSONDecodeError:
                    pass

    return {}


def extract_answer_field(text: str) -> Optional[str]:
    """Best-effort extraction of the "answer" field from a raw LLM response.

    Used as a fallback when ``extract_json_object`` fails to parse the full
    JSON (e.g. due to unescaped control characters in streamed output).

    Args:
        text: Raw model output. May be empty.

    Returns:
        The unescaped value of the first ``"answer": "..."`` pair, ``None``
        when ``text`` is empty or contains no such pair. If the captured
        value cannot be decoded as a JSON string (for example because of an
        invalid escape sequence), it is returned verbatim.
    """
    if not text:
        return None

    match = _ANSWER_FIELD_RE.search(text)
    if not match:
        return None

    raw_value = match.group(1)
    # Unescape standard JSON escape sequences. ``strict=False`` lets the
    # decoder accept raw control characters (literal newlines/tabs) inside
    # the string, so such values are still unescaped instead of being
    # returned with their backslash sequences intact.
    try:
        return json.loads(f'"{raw_value}"', strict=False)
    except json.JSONDecodeError:
        return raw_value


def _repair_control_chars(s: str) -> str:
    """Escape raw control characters that appear inside JSON string literals.

    Models sometimes emit raw newlines / tabs inside string literals, which
    ``json.loads`` rejects. This replaces them with proper escapes while
    leaving the surrounding JSON structure (including whitespace between
    tokens) intact.

    Args:
        s: JSON-like text.

    Returns:
        ``s`` with every character below U+0020 that occurs inside a string
        literal replaced by a JSON escape: ``\\n``, ``\\r`` and ``\\t`` for
        the common ones, ``\\u00XX`` for the rest.
    """
    result = []
    in_string = False
    # True when the previous character was an unconsumed backslash inside a
    # string. Tracking this explicitly (rather than peeking at the previous
    # character) keeps a string that ends in an escaped backslash, such as
    # "x\\", from being mistaken for one that ends in an escaped quote.
    escaped = False

    for ch in s:
        if not in_string:
            if ch == '"':
                in_string = True
            result.append(ch)
            continue

        if escaped:
            escaped = False
        elif ch == "\\":
            escaped = True
        elif ch == '"':
            in_string = False

        if ch < " ":
            result.append(_CONTROL_ESCAPES.get(ch) or "\\u%04x" % ord(ch))
        else:
            result.append(ch)

    return "".join(result)


def compact_json(value: Any) -> str:
    """Serialize ``value`` to JSON text.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX``
    escapes. Despite the function's name, the output is pretty-printed with
    a two-space indent.

    Args:
        value: Any JSON-serializable value.

    Returns:
        The JSON text for ``value``.
    """
    return json.dumps(value, ensure_ascii=False, indent=2)