| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104 |
- # -*- coding: utf-8 -*-
- """Small LLM output helpers."""
- import json
- import re
- from typing import Any, Dict, Optional
# Captures the body of a Markdown code fence (``` or ```json, matched
# case-insensitively); whitespace directly inside the fence markers is
# excluded from the captured group.
_FENCED_JSON_RE = re.compile(
    r"```(?:json)?\s*([\s\S]*?)\s*```",
    flags=re.IGNORECASE,
)

# Regex fallback: captures the raw (still escaped) double-quoted value that
# follows an "answer" key in JSON-like text. Backslash escapes are skipped as
# pairs, and DOTALL lets the value span multiple lines.
_ANSWER_FIELD_RE = re.compile(r'"answer"\s*:\s*"((?:[^"\\]|\\.)*)"', flags=re.DOTALL)
def _decode_as_object(raw: str) -> Optional[Dict[str, Any]]:
    """Decode *raw* as JSON; None if it does not parse, {} if it is not an object."""
    try:
        value = json.loads(raw)
    except json.JSONDecodeError:
        return None
    return value if isinstance(value, dict) else {}


def _parse_json_candidate(candidate: str) -> Optional[Dict[str, Any]]:
    """Try to get a JSON object out of *candidate*; None if nothing decodes.

    Attempts, in order: the candidate as-is, the span from its first ``{`` to
    its last ``}``, and that span with literal control characters escaped.
    """
    parsed = _decode_as_object(candidate)
    if parsed is not None:
        return parsed

    start = candidate.find("{")
    end = candidate.rfind("}")
    if start < 0 or end <= start:
        return None

    fragment = candidate[start:end + 1]
    parsed = _decode_as_object(fragment)
    if parsed is not None:
        return parsed

    # Retry with control characters escaped (common when the model emits
    # literal newlines/tabs inside string values).
    repaired = _repair_control_chars(fragment)
    if repaired != fragment:
        return _decode_as_object(repaired)
    return None


def extract_json_object(text: str) -> Dict[str, Any]:
    """Extract a JSON object from a model response.

    Args:
        text: Raw model output. It may be bare JSON, JSON wrapped in a
            Markdown code fence, or JSON surrounded by prose.

    Returns:
        The decoded object, or an empty dict when no object can be decoded
        (including when the decoded JSON value is not an object).
    """
    if not text:
        return {}

    stripped = text.strip()
    fenced_match = _FENCED_JSON_RE.search(stripped)
    if fenced_match:
        # A fence can sit inside a JSON string value (an answer that quotes
        # code). When the whole response is itself a JSON object, use it
        # rather than the fence contents.
        whole = _decode_as_object(stripped)
        if whole:
            return whole
        parsed = _parse_json_candidate(fenced_match.group(1).strip())
        if parsed is not None:
            return parsed
        # The fence held nothing decodable; fall through to the full text so
        # an object outside the fence is not lost.

    parsed = _parse_json_candidate(stripped)
    return {} if parsed is None else parsed
def extract_answer_field(text: str) -> Optional[str]:
    """Best-effort extraction of the "answer" field from a raw LLM response.

    Used as a fallback when ``extract_json_object`` fails to parse the full
    JSON (e.g. due to unescaped control characters in streamed output).

    Args:
        text: Raw model output containing a JSON-like ``"answer": "..."`` pair.

    Returns:
        The answer string with JSON escape sequences decoded, the raw matched
        text if it cannot be decoded, or None when *text* is empty or has no
        double-quoted "answer" value.
    """
    if not text:
        return None

    match = _ANSWER_FIELD_RE.search(text)
    if match is None:
        return None

    raw_value = match.group(1)
    try:
        # strict=False accepts literal newlines/tabs inside the string. Those
        # are the inputs this fallback exists for, and a strict decode would
        # reject them and leave every other escape (\" \\ \n) undecoded.
        return json.loads(f'"{raw_value}"', strict=False)
    except json.JSONDecodeError:
        # Still undecodable (e.g. an invalid escape such as \x): return the
        # raw text rather than nothing.
        return raw_value
# Two-character escapes JSON defines for control characters. Any other
# character below U+0020 is written as a \u00XX escape instead.
_CONTROL_CHAR_ESCAPES = {
    "\b": "\\b",
    "\t": "\\t",
    "\n": "\\n",
    "\f": "\\f",
    "\r": "\\r",
}


def _repair_control_chars(s: str) -> str:
    """Escape literal control characters that appear inside JSON strings.

    Models sometimes emit raw newlines / tabs inside string literals, which
    ``json.loads`` rejects. This replaces every character below U+0020 that
    occurs between double quotes with its JSON escape, and leaves characters
    outside strings (structural whitespace) untouched.

    Args:
        s: JSON-like text, possibly containing raw control characters.

    Returns:
        The text with in-string control characters escaped; identical to *s*
        when there is nothing to repair.
    """
    result = []
    in_string = False
    # True while the previous in-string character was a backslash that starts
    # an escape pair. Tracking this as state rather than peeking at the
    # preceding character handles an escaped backslash before a closing
    # quote ("C:\\"), where the quote really does end the string.
    escaped = False
    for c in s:
        if not in_string:
            if c == '"':
                in_string = True
            result.append(c)
            continue

        if escaped:
            escaped = False
        elif c == "\\":
            escaped = True
        elif c == '"':
            in_string = False

        if c < " ":
            result.append(_CONTROL_CHAR_ESCAPES.get(c) or f"\\u{ord(c):04x}")
        else:
            result.append(c)
    return "".join(result)
def compact_json(value: Any) -> str:
    """Serialize *value* to a JSON string.

    Despite the name, the output is pretty-printed with a two-space indent;
    non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    encoder = json.JSONEncoder(ensure_ascii=False, indent=2)
    return encoder.encode(value)
|