diff --git a/parser/sqlfn.py b/parser/sqlfn.py
new file mode 100644
index 0000000..0c0fcf1
--- /dev/null
+++ b/parser/sqlfn.py
@@ -0,0 +1,188 @@
+"""Attach the SQL-name map (@sqlfn / @sqlop) to the MEOS-API catalog.
+
+The catalog carries MEOS-C function names + C signatures, but bindings that
+emit a SQL/UDF surface (MobilityDB SQL, MobilitySpark UDFs, MobilityDuck, …)
+need the user-facing SQL name and operator. Both are machine-extractable from
+the doxygen tag chain that already pervades the source:
+
+  MEOS-C fn --@csqlfn #MobilityDB_C()--> MobilityDB-C wrapper
+  MobilityDB-C wrapper --@sqlfn sqlName() / @sqlop @p <op>--> SQL name + op
+
+So: in meos/src `@csqlfn #Wrapper()` sits above the MEOS-C function (→ MEOS-C →
+Wrapper); in mobilitydb/src `@sqlfn name()` + `@sqlop @p <op>` sit above
+`Datum Wrapper(PG_FUNCTION_ARGS)` (→ Wrapper → name, op). Join on Wrapper.
+
+Adds per function (when the chain resolves): `sqlfn`, `sqlop`, `mdbC`, plus
+`sqlfnAll` / `mdbCAll` when more than one SQL name / wrapper is involved.
+"""
+import re
+from pathlib import Path
+
+# A @csqlfn tag carries one OR MORE #Wrapper() references — comma- or
+# space-separated, and possibly continued across doxygen lines — because a single
+# MEOS function can back several wrappers (the ever/always pair eDisjoint/aDisjoint
+# share one ea_* function; the shift/scale/shift_scale trio share one C function).
+# The tag value runs from @csqlfn up to the next doxygen tag or the comment close.
+_CSQLFN = re.compile(r"@csqlfn\b")
+_CSQLFN_REF = re.compile(r"#(\w+)\s*\(\)")
+_CSQLFN_END = re.compile(r"@\w|\*/")
+# After the doxygen close, the MEOS-C definition: an optional return-type line
+# (no parens/braces/;/=), then `name(`.
+_FNDEF = re.compile(r"\*/\s*\n(?:[^\n(){};=]+\n)?(\w+)\s*\(")
+_SQLFN = re.compile(r"@sqlfn\s+(\w+)\s*\(\)")
+_SQLOP = re.compile(r"@sqlop\s+@p\s+(\S+)")
+_DATUM = re.compile(r"Datum\s+(\w+)\s*\(\s*PG_FUNCTION_ARGS")
+
+
+def _meos_to_mdb(meos_src):
+    """Map each MEOS-C function name to its MobilityDB-C wrapper names.
+
+    Scans every ``*.c`` file under ``meos_src`` for @csqlfn tags. One MEOS
+    function can back more than one wrapper — the ever/always pair
+    eDisjoint/aDisjoint share a single ea_* function tagged
+    `@csqlfn #Edisjoint_…() #Adisjoint_…()` — so each @csqlfn carries one or
+    more #Wrapper() references; collect them all (mirrors _mdb_to_sql
+    collecting every @sqlfn rather than the first).
+
+    Returns a dict: MEOS-C name -> ordered, de-duplicated list of wrapper names.
+    """
+    out = {}
+    # Sorted traversal: rglob order is filesystem-dependent, and the first
+    # wrapper recorded becomes the primary one, so keep the scan reproducible.
+    for cf in sorted(Path(meos_src).rglob("*.c")):
+        text = cf.read_text(errors="ignore")
+        for m in _CSQLFN.finditer(text):
+            # Search in place rather than slicing the rest of the file per tag.
+            end = _CSQLFN_END.search(text, m.end())
+            value = text[m.end():end.start()] if end else text[m.end():]
+            wrappers = _CSQLFN_REF.findall(value)
+            if not wrappers:
+                continue
+            # The tagged function is taken to be the first `name(` after a comment close.
+            fm = _FNDEF.search(text, m.end())
+            if not fm:
+                continue
+            lst = out.setdefault(fm.group(1), [])
+            for w in wrappers:
+                if w not in lst:
+                    lst.append(w)
+    return out
+
+
+def _mdb_to_sql(mdb_src):
+    """Map each MobilityDB-C wrapper name to its (sqlfn, sqlop) pairs.
+
+    A shared PG wrapper can carry more than one @sqlfn (e.g. Temporal_derivative
+    is exposed as both derivative() and speed()), so collect ALL of them rather
+    than the first — otherwise the mapped SQL name is order-dependent.
+
+    Returns a dict: wrapper name -> ordered, de-duplicated list of
+    (sqlfn, sqlop) tuples, where sqlop is None when the block has no @sqlop.
+    """
+    out = {}
+    # Sorted traversal keeps the per-wrapper order independent of the filesystem.
+    for cf in sorted(Path(mdb_src).rglob("*.c")):
+        text = cf.read_text(errors="ignore")
+        for m in _SQLFN.finditer(text):
+            sqlfn = m.group(1)
+            # @sqlop lives in the SAME doxygen block (before the closing */).
+            # With no closing */ at all, fall back to an 800-character window.
+            close = text.find("*/", m.end())
+            block = text[m.start():close] if close != -1 else text[m.start():m.start() + 800]
+            op = _SQLOP.search(block)
+            dm = _DATUM.search(text, close if close != -1 else m.end())
+            if dm:
+                entry = (sqlfn, op.group(1) if op else None)
+                lst = out.setdefault(dm.group(1), [])
+                if entry not in lst:
+                    lst.append(entry)
+    return out
+
+
+def attach_sqlfn_map(idl, meos_src, mdb_src):
+    """Annotate ``idl["functions"]`` in place with the resolved SQL names.
+
+    For every function whose @csqlfn -> @sqlfn chain resolves, sets `mdbC`,
+    `sqlfn` and (when present) `sqlop`, plus `sqlfnAll` / `mdbCAll` when more
+    than one SQL name / wrapper is involved.
+
+    Returns ``(idl, n)`` where n is the number of functions annotated.
+    """
+    m2d = _meos_to_mdb(meos_src)
+    d2s = _mdb_to_sql(mdb_src)
+    n = 0
+    for f in idl["functions"]:
+        wrappers = m2d.get(f["name"])
+        if not wrappers:
+            continue
+        # A MEOS function can back several wrappers (the ever/always pair), each
+        # carrying its own @sqlfn; collect the (sqlfn, sqlop) pairs across all of
+        # them in order. The primary wrapper is the first one that actually
+        # resolves, so `mdbC` always names the wrapper that supplied `sqlfn`.
+        pairs = []
+        primary = None
+        for w in wrappers:
+            entries = d2s.get(w, [])
+            if entries and primary is None:
+                primary = w
+            for entry in entries:
+                if entry not in pairs:
+                    pairs.append(entry)
+        if not pairs:
+            continue
+        f["mdbC"] = primary
+        f["sqlfn"] = pairs[0][0]
+        if pairs[0][1]:
+            f["sqlop"] = pairs[0][1]
+        # Shared wrapper OR ever/always pair exposing >1 SQL name: record them all.
+        if len(pairs) > 1:
+            f["sqlfnAll"] = [s for s, _ in pairs]
+        if len(wrappers) > 1:
+            f["mdbCAll"] = list(wrappers)
+        n += 1
+    return idl, n
+
+
+# MEOS-C ever/always spatial-relationship functions are named <e|a><relation>_...; their
+# @csqlfn must point at the matching <E|A><relation>_... wrapper. A copy-paste @csqlfn in
+# meos/src (e.g. eintersects_tgeo_geo tagged #Aintersects_tgeo_geo) silently flips the
+# resolved @sqlfn from eX to aX — which then drops the real overload from the eX dispatch
+# group and lets a wrong subtype backing be reached (a runtime "must be of type ..." error
+# in the bindings). The parser is faithful, so guard the SOURCE here: flag any function
+# whose name e/a prefix disagrees with its resolved @sqlfn e/a prefix.
+_EA_FAMILY = re.compile(
+    r"^(e|a)(intersects|disjoint|contains|contained|covers|coveredby|touches|"
+    r"dwithin|within|equals|crosses|overlaps)_")
+# A resolved SQL name is treated as ever/always when it is e/a followed by an
+# upper-case letter (eDisjoint, aDisjoint, …).
+_EA_SQLFN = re.compile(r"[ea][A-Z]")
+
+
+def lint_ea_sqlfn(idl):
+    """Return [(meos_c_name, sqlfn)] where the function's ever/always (e/a) name prefix
+    contradicts its resolved @sqlfn — a source @csqlfn mistag in meos/src."""
+    bad = []
+    for f in idl["functions"]:
+        sf = f.get("sqlfn")
+        m = _EA_FAMILY.match(f["name"])
+        if sf and m and _EA_SQLFN.match(sf) and sf[0] != m.group(1):
+            bad.append((f["name"], sf))
+    return bad
+
+
+def lint_sqlfn_case_collisions(idl):
+    """Return [(lower, [spelling, ...])] for @sqlfn names that collide
+    case-insensitively but differ in case (e.g. tDistance vs tdistance).
+
+    PostgreSQL folds unquoted identifiers to lower case, so the two spell the
+    SAME SQL function and the clash is invisible in SQL / pg_regress. But the
+    binding name is taken case-SENSITIVELY, and case-insensitive engines (Spark
+    SQL, …) register every spelling under one UDF — so one silently shadows the
+    other. A canonical binding name must have exactly ONE spelling; surface a
+    casing straggler here before it reaches a binding."""
+    by_lower = {}
+    for f in idl["functions"]:
+        for sf in [f.get("sqlfn"), *f.get("sqlfnAll", [])]:
+            if sf:
+                by_lower.setdefault(sf.lower(), set()).add(sf)
+    return sorted((lo, sorted(sp)) for lo, sp in by_lower.items() if len(sp) > 1)
diff --git a/run.py b/run.py
index 640dd9e..2d59ed3 100644
--- a/run.py
+++ b/run.py
@@ -1,3 +1,4 @@
+import os
 import sys
 import json
 from pathlib import Path
@@ -5,6 +6,7 @@
 from parser.parser import parse_all_headers, merge_meta
 from parser.portable import attach_portable_aliases
 from parser.typerecover import recover_collapsed_types
+from parser.sqlfn import attach_sqlfn_map, lint_ea_sqlfn, lint_sqlfn_case_collisions
 
 HEADERS_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("./meos/include")
 
@@ -40,6 +42,38 @@ def main():
           file=sys.stderr)
     idl = attach_portable_aliases(idl, PORTABLE_PATH)
 
+    # 4. Attach the SQL-name map (@sqlfn/@sqlop) from the vendored source.
+    # The source root is overridable (MDB_SRC_ROOT) so a binding can point the
+    # @sqlfn/@ingroup extraction at the SAME pinned checkout as the headers,
+    # keeping the catalog reproducibly equivalent to that pin.
+    SRC_ROOT = Path(os.environ.get("MDB_SRC_ROOT", "./_mobilitydb"))
+    MEOS_SRC = SRC_ROOT / "meos" / "src"
+    MDB_SRC = SRC_ROOT / "mobilitydb" / "src"
+    if MEOS_SRC.exists() and MDB_SRC.exists():
+        idl, nsql = attach_sqlfn_map(idl, MEOS_SRC, MDB_SRC)
+        print(f"[4/4] Attached {nsql} @sqlfn SQL names", file=sys.stderr)
+    # Guard: a copy-paste @csqlfn in meos/src can point an ever/always function at
+    # the opposite-prefix wrapper (eintersects_* tagged #Aintersects_*), flipping its
+    # SQL name and breaking the binding overload dispatch. The parser is faithful, so
+    # surface the SOURCE mistag here rather than ship a wrong catalog silently.
+    ea_bad = lint_ea_sqlfn(idl)
+    if ea_bad:
+        print(f" ⚠ {len(ea_bad)} @csqlfn e/a-prefix mistag(es) in meos/src "
+              f"(fix at source — wrong @sqlfn resolved):", file=sys.stderr)
+        for cname, sf in ea_bad:
+            print(f" {cname} -> @sqlfn {sf}", file=sys.stderr)
+    # Guard: @sqlfn names that differ only by case (e.g. tDistance vs tdistance)
+    # are the SAME SQL function (PostgreSQL folds the identifier) but DISTINCT
+    # binding names — a case-insensitive engine (Spark SQL) registers both under
+    # one UDF, so one silently shadows the other. Invisible in SQL; surface the
+    # casing straggler here, to be fixed at the MEOS-C @sqlfn source.
+    case_bad = lint_sqlfn_case_collisions(idl)
+    if case_bad:
+        print(f" ⚠ {len(case_bad)} @sqlfn case-collision(s) (pick ONE canonical "
+              f"spelling at the MEOS-C source — binding-breaking otherwise):", file=sys.stderr)
+        for _lo, spellings in case_bad:
+            print(f" {' vs '.join(spellings)}", file=sys.stderr)
+
     idl_path = OUTPUT_DIR / "meos-idl.json"
     with open(idl_path, "w") as f:
         json.dump(idl, f, indent=2)