From a8779b4e160bd7948ab0caa038eb274259931cdf Mon Sep 17 00:00:00 2001 From: Esteban Zimanyi Date: Fri, 12 Jun 2026 21:02:09 +0200 Subject: [PATCH 1/4] Attach the @sqlfn SQL-name map to the catalog Follow each function's @csqlfn -> wrapper @sqlfn -> SQL name chain and attach the resulting SQL name to the catalog. The vendored-source root is overridable via MDB_SRC_ROOT so the @sqlfn (and @ingroup) extraction can be pointed at the same pinned MobilityDB checkout as the headers, keeping the generated catalog reproducibly equivalent to that pin. --- parser/sqlfn.py | 86 +++++++++++++++++++++++++++++++++++++++++++++++++ run.py | 13 ++++++++ 2 files changed, 99 insertions(+) create mode 100644 parser/sqlfn.py diff --git a/parser/sqlfn.py b/parser/sqlfn.py new file mode 100644 index 0000000..4d8b2a9 --- /dev/null +++ b/parser/sqlfn.py @@ -0,0 +1,86 @@ +"""Attach the SQL-name map (@sqlfn / @sqlop) to the MEOS-API catalog. + +The catalog carries MEOS-C function names + C signatures, but bindings that +emit a SQL/UDF surface (MobilityDB SQL, MobilitySpark UDFs, MobilityDuck, …) +need the user-facing SQL name and operator. Both are machine-extractable from +the doxygen tag chain that already pervades the source: + + MEOS-C fn --@csqlfn #MobilityDB_C()--> MobilityDB-C wrapper + MobilityDB-C wrapper --@sqlfn sqlName() / @sqlop @p op --> SQL name + op + +So: in meos/src `@csqlfn #Wrapper()` sits above the MEOS-C function (→ MEOS-C → +Wrapper); in mobilitydb/src `@sqlfn name()` + `@sqlop @p op` sit above +`Datum Wrapper(PG_FUNCTION_ARGS)` (→ Wrapper → name, op). Join on Wrapper. + +Adds per function (when the chain resolves): `sqlfn`, `sqlop`, `mdbC`. +""" +import re +from pathlib import Path + +_CSQLFN = re.compile(r"@csqlfn\s+#(\w+)\s*\(\)") +# After the doxygen close, the MEOS-C definition: an optional return-type line +# (no parens/braces/;/=), then `name(`. 
+_FNDEF = re.compile(r"\*/\s*\n(?:[^\n(){};=]+\n)?(\w+)\s*\(") +_SQLFN = re.compile(r"@sqlfn\s+(\w+)\s*\(\)") +_SQLOP = re.compile(r"@sqlop\s+@p\s+(\S+)") +_DATUM = re.compile(r"Datum\s+(\w+)\s*\(\s*PG_FUNCTION_ARGS") + + +def _meos_to_mdb(meos_src): + """MEOS-C function name -> MobilityDB-C wrapper name (from @csqlfn).""" + out = {} + for cf in Path(meos_src).rglob("*.c"): + text = cf.read_text(errors="ignore") + for m in _CSQLFN.finditer(text): + mdb_c = m.group(1) + fm = _FNDEF.search(text, m.end()) + if fm: + out.setdefault(fm.group(1), mdb_c) + return out + + +def _mdb_to_sql(mdb_src): + """MobilityDB-C wrapper name -> ordered list of (sqlfn, sqlop). + + A shared PG wrapper can carry more than one @sqlfn (e.g. Temporal_derivative + is exposed as both derivative() and speed()), so collect ALL of them rather + than the first — otherwise the mapped SQL name is order-dependent. + """ + out = {} + for cf in Path(mdb_src).rglob("*.c"): + text = cf.read_text(errors="ignore") + for m in _SQLFN.finditer(text): + sqlfn = m.group(1) + # @sqlop lives in the SAME doxygen block (before the closing */). + close = text.find("*/", m.end()) + block = text[m.start():close] if close != -1 else text[m.start():m.start() + 800] + op = _SQLOP.search(block) + dm = _DATUM.search(text, close if close != -1 else m.end()) + if dm: + entry = (sqlfn, op.group(1) if op else None) + lst = out.setdefault(dm.group(1), []) + if entry not in lst: + lst.append(entry) + return out + + +def attach_sqlfn_map(idl, meos_src, mdb_src): + m2d = _meos_to_mdb(meos_src) + d2s = _mdb_to_sql(mdb_src) + n = 0 + for f in idl["functions"]: + mdb_c = m2d.get(f["name"]) + if not mdb_c: + continue + lst = d2s.get(mdb_c) + if not lst: + continue + f["mdbC"] = mdb_c + f["sqlfn"] = lst[0][0] + if lst[0][1]: + f["sqlop"] = lst[0][1] + # Shared wrapper exposing >1 SQL name: record them all (binding picks). 
+ if len(lst) > 1: + f["sqlfnAll"] = [s for s, _ in lst] + n += 1 + return idl, n diff --git a/run.py b/run.py index 640dd9e..7e0da7f 100644 --- a/run.py +++ b/run.py @@ -1,3 +1,4 @@ +import os import sys import json from pathlib import Path @@ -5,6 +6,7 @@ from parser.parser import parse_all_headers, merge_meta from parser.portable import attach_portable_aliases from parser.typerecover import recover_collapsed_types +from parser.sqlfn import attach_sqlfn_map HEADERS_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("./meos/include") @@ -40,6 +42,17 @@ def main(): file=sys.stderr) idl = attach_portable_aliases(idl, PORTABLE_PATH) + # 4. Attach the SQL-name map (@sqlfn/@sqlop) from the vendored source. + # The source root is overridable (MDB_SRC_ROOT) so a binding can point the + # @sqlfn/@ingroup extraction at the SAME pinned checkout as the headers, + # keeping the catalog reproducibly equivalent to that pin. + SRC_ROOT = Path(os.environ.get("MDB_SRC_ROOT", "./_mobilitydb")) + MEOS_SRC = SRC_ROOT / "meos" / "src" + MDB_SRC = SRC_ROOT / "mobilitydb" / "src" + if MEOS_SRC.exists() and MDB_SRC.exists(): + idl, nsql = attach_sqlfn_map(idl, MEOS_SRC, MDB_SRC) + print(f"[4/4] Attached {nsql} @sqlfn SQL names", file=sys.stderr) + idl_path = OUTPUT_DIR / "meos-idl.json" with open(idl_path, "w") as f: json.dump(idl, f, indent=2) From 594b1085590f2d86d668c87dc1a9686aa1b3ff26 Mon Sep 17 00:00:00 2001 From: Esteban Zimanyi Date: Mon, 15 Jun 2026 23:30:26 +0200 Subject: [PATCH 2/4] Guard against ever/always @csqlfn prefix mistags in meos/src MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A copy-paste @csqlfn in meos/src can point an ever/always spatial-relationship function (named {e|a}{relation}_...) at the opposite-prefix MobilityDB-C wrapper — e.g. eintersects_tgeo_geo is tagged `@csqlfn #Aintersects_tgeo_geo`, so the chain resolves its @sqlfn to aIntersects instead of eIntersects. 
That silently drops the real (tgeo,geo) overload from the eIntersects group in a binding's overload dispatcher, leaving only a wrong-subtype backing reachable — which then raises a runtime "The temporal value must be of type tcbuffer" on a tgeompoint (observed in the MobilitySpark BerlinMOD bench, q17). The parser is faithful — it propagates whatever the source tags say — so this is a source defect to fix in meos/src, not a parser bug. Add lint_ea_sqlfn() and report the mismatches at catalog-gen so they surface loudly instead of shipping a wrong @sqlfn silently. Flags 5 live mistags (eintersects_tgeo_geo, etouches_tpoint_geo/_tcbuffer_geo/_tcbuffer_cbuffer, econtains_geo_trgeo); relayed to the source maintainers to correct the @csqlfn tags. --- parser/sqlfn.py | 24 ++++++++++++++++++++++++ run.py | 12 +++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/parser/sqlfn.py b/parser/sqlfn.py index 4d8b2a9..4d61136 100644 --- a/parser/sqlfn.py +++ b/parser/sqlfn.py @@ -84,3 +84,27 @@ def attach_sqlfn_map(idl, meos_src, mdb_src): f["sqlfnAll"] = [s for s, _ in lst] n += 1 return idl, n + + +# MEOS-C ever/always spatial-relationship functions are named {e|a}{relation}_...; their +# @csqlfn must point at the matching {E|A}{relation}_... wrapper. A copy-paste @csqlfn in +# meos/src (e.g. eintersects_tgeo_geo tagged #Aintersects_tgeo_geo) silently flips the +# resolved @sqlfn from eX to aX — which then drops the real overload from the eX dispatch +# group and lets a wrong subtype backing be reached (a runtime "must be of type ..." error +# in the bindings). The parser is faithful, so guard the SOURCE here: flag any function +# whose name e/a prefix disagrees with its resolved @sqlfn e/a prefix. 
+_EA_FAMILY = re.compile( + r"^(e|a)(intersects|disjoint|contains|contained|covers|coveredby|touches|" + r"dwithin|within|equals|crosses|overlaps)_") + + +def lint_ea_sqlfn(idl): + """Return [(meos_c_name, sqlfn)] where the function's ever/always (e/a) name prefix + contradicts its resolved @sqlfn — a source @csqlfn mistag in meos/src.""" + bad = [] + for f in idl["functions"]: + sf = f.get("sqlfn") + m = _EA_FAMILY.match(f["name"]) + if sf and m and re.match(r"^[ea][A-Z]", sf) and sf[0] != m.group(1): + bad.append((f["name"], sf)) + return bad diff --git a/run.py b/run.py index 7e0da7f..3812565 100644 --- a/run.py +++ b/run.py @@ -6,7 +6,7 @@ from parser.parser import parse_all_headers, merge_meta from parser.portable import attach_portable_aliases from parser.typerecover import recover_collapsed_types -from parser.sqlfn import attach_sqlfn_map +from parser.sqlfn import attach_sqlfn_map, lint_ea_sqlfn HEADERS_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("./meos/include") @@ -52,6 +52,16 @@ def main(): if MEOS_SRC.exists() and MDB_SRC.exists(): idl, nsql = attach_sqlfn_map(idl, MEOS_SRC, MDB_SRC) print(f"[4/4] Attached {nsql} @sqlfn SQL names", file=sys.stderr) + # Guard: a copy-paste @csqlfn in meos/src can point an ever/always function at + # the opposite-prefix wrapper (eintersects_* tagged #Aintersects_*), flipping its + # SQL name and breaking the binding overload dispatch. The parser is faithful, so + # surface the SOURCE mistag here rather than ship a wrong catalog silently. 
+ ea_bad = lint_ea_sqlfn(idl) + if ea_bad: + print(f" ⚠ {len(ea_bad)} @csqlfn e/a-prefix mistag(es) in meos/src " + f"(fix at source — wrong @sqlfn resolved):", file=sys.stderr) + for cname, sf in ea_bad: + print(f" {cname} -> @sqlfn {sf}", file=sys.stderr) idl_path = OUTPUT_DIR / "meos-idl.json" with open(idl_path, "w") as f: From 733a6aeab48dd75e14ecc6d31ae2eb73745e1125 Mon Sep 17 00:00:00 2001 From: Esteban Zimanyi Date: Sun, 21 Jun 2026 21:18:28 +0200 Subject: [PATCH 3/4] Parse every #Wrapper() reference in a @csqlfn tag A single MEOS function can back more than one MobilityDB-C wrapper: the ever/always pair (eDisjoint/aDisjoint) shares one ea_* function, and the shift/scale/shift_scale trio shares one C function. Such functions carry one @csqlfn with comma- or space-separated #Wrapper() references that may continue across doxygen lines. _meos_to_mdb reads the whole tag value and returns every wrapper, and attach_sqlfn_map records all the SQL names a function exposes (sqlfn/sqlfnAll, mdbC/mdbCAll). --- parser/sqlfn.py | 62 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/parser/sqlfn.py b/parser/sqlfn.py index 4d61136..6391df2 100644 --- a/parser/sqlfn.py +++ b/parser/sqlfn.py @@ -17,7 +17,14 @@ import re from pathlib import Path -_CSQLFN = re.compile(r"@csqlfn\s+#(\w+)\s*\(\)") +# A @csqlfn tag carries one OR MORE #Wrapper() references — comma- or +# space-separated, and possibly continued across doxygen lines — because a single +# MEOS function can back several wrappers (the ever/always pair eDisjoint/aDisjoint +# share one ea_* function; the shift/scale/shift_scale trio share one C function). +# The tag value runs from @csqlfn up to the next doxygen tag or the comment close. 
+_CSQLFN = re.compile(r"@csqlfn\b") +_CSQLFN_REF = re.compile(r"#(\w+)\s*\(\)") +_CSQLFN_END = re.compile(r"@\w|\*/") # After the doxygen close, the MEOS-C definition: an optional return-type line # (no parens/braces/;/=), then `name(`. _FNDEF = re.compile(r"\*/\s*\n(?:[^\n(){};=]+\n)?(\w+)\s*\(") @@ -27,15 +34,29 @@ def _meos_to_mdb(meos_src): - """MEOS-C function name -> MobilityDB-C wrapper name (from @csqlfn).""" + """MEOS-C function name -> ordered list of MobilityDB-C wrapper names (from + @csqlfn). One MEOS function can back more than one wrapper — the ever/always + pair eDisjoint/aDisjoint share a single ea_* function tagged + `@csqlfn #Edisjoint_…() #Adisjoint_…()` — so each @csqlfn carries one or more + #Wrapper() references; collect them all (mirrors _mdb_to_sql collecting every + @sqlfn rather than the first).""" out = {} for cf in Path(meos_src).rglob("*.c"): text = cf.read_text(errors="ignore") for m in _CSQLFN.finditer(text): - mdb_c = m.group(1) + tail = text[m.end():] + end = _CSQLFN_END.search(tail) + value = tail[:end.start()] if end else tail + wrappers = _CSQLFN_REF.findall(value) + if not wrappers: + continue fm = _FNDEF.search(text, m.end()) - if fm: - out.setdefault(fm.group(1), mdb_c) + if not fm: + continue + lst = out.setdefault(fm.group(1), []) + for w in wrappers: + if w not in lst: + lst.append(w) return out @@ -69,19 +90,28 @@ def attach_sqlfn_map(idl, meos_src, mdb_src): d2s = _mdb_to_sql(mdb_src) n = 0 for f in idl["functions"]: - mdb_c = m2d.get(f["name"]) - if not mdb_c: + wrappers = m2d.get(f["name"]) + if not wrappers: continue - lst = d2s.get(mdb_c) - if not lst: + # A MEOS function can back several wrappers (the ever/always pair), each + # carrying its own @sqlfn; collect the (sqlfn, sqlop) pairs across all of + # them in order, keeping the primary (first) wrapper for back-compat. 
+ pairs = [] + for w in wrappers: + for entry in d2s.get(w, []): + if entry not in pairs: + pairs.append(entry) + if not pairs: continue - f["mdbC"] = mdb_c - f["sqlfn"] = lst[0][0] - if lst[0][1]: - f["sqlop"] = lst[0][1] - # Shared wrapper exposing >1 SQL name: record them all (binding picks). - if len(lst) > 1: - f["sqlfnAll"] = [s for s, _ in lst] + f["mdbC"] = wrappers[0] + f["sqlfn"] = pairs[0][0] + if pairs[0][1]: + f["sqlop"] = pairs[0][1] + # Shared wrapper OR ever/always pair exposing >1 SQL name: record them all. + if len(pairs) > 1: + f["sqlfnAll"] = [s for s, _ in pairs] + if len(wrappers) > 1: + f["mdbCAll"] = wrappers n += 1 return idl, n From 6566a15992177a1c0175bd3c0a63104fae5230cc Mon Sep 17 00:00:00 2001 From: Esteban Zimanyi Date: Tue, 23 Jun 2026 10:27:41 +0200 Subject: [PATCH 4/4] Lint @sqlfn names that collide only by case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add lint_sqlfn_case_collisions next to lint_ea_sqlfn: it groups every @sqlfn (and sqlfnAll) name by its lower-case form and flags any group with more than one spelling (e.g. tDistance vs tdistance), wired into run.py as a loud warning. Such names are the SAME SQL function — PostgreSQL folds unquoted identifiers to lower case — so the clash is invisible in SQL and pg_regress. But the binding name is case-sensitive, and case-insensitive engines (Spark SQL) register every spelling under one UDF, so one silently shadows the other (e.g. the trgeometry tdistance shadowing the tgeo tDistance dispatch, returning NULL). Surfacing it at catalog generation lets the casing straggler be fixed at the MEOS-C @sqlfn source before it reaches any binding. 
--- parser/sqlfn.py | 18 ++++++++++++++++++ run.py | 13 ++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/parser/sqlfn.py b/parser/sqlfn.py index 6391df2..0c0fcf1 100644 --- a/parser/sqlfn.py +++ b/parser/sqlfn.py @@ -138,3 +138,21 @@ def lint_ea_sqlfn(idl): if sf and m and re.match(r"^[ea][A-Z]", sf) and sf[0] != m.group(1): bad.append((f["name"], sf)) return bad + + +def lint_sqlfn_case_collisions(idl): + """Return [(lower, [spelling, ...])] for @sqlfn names that collide + case-insensitively but differ in case (e.g. tDistance vs tdistance). + + PostgreSQL folds unquoted identifiers to lower case, so the two spell the + SAME SQL function and the clash is invisible in SQL / pg_regress. But the + binding name is taken case-SENSITIVELY, and case-insensitive engines (Spark + SQL, …) register every spelling under one UDF — so one silently shadows the + other. A canonical binding name must have exactly ONE spelling; surface a + casing straggler here before it reaches a binding.""" + by_lower = {} + for f in idl["functions"]: + for sf in [f.get("sqlfn"), *f.get("sqlfnAll", [])]: + if sf: + by_lower.setdefault(sf.lower(), set()).add(sf) + return sorted((lo, sorted(sp)) for lo, sp in by_lower.items() if len(sp) > 1) diff --git a/run.py b/run.py index 3812565..2d59ed3 100644 --- a/run.py +++ b/run.py @@ -6,7 +6,7 @@ from parser.parser import parse_all_headers, merge_meta from parser.portable import attach_portable_aliases from parser.typerecover import recover_collapsed_types -from parser.sqlfn import attach_sqlfn_map, lint_ea_sqlfn +from parser.sqlfn import attach_sqlfn_map, lint_ea_sqlfn, lint_sqlfn_case_collisions HEADERS_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("./meos/include") @@ -62,6 +62,17 @@ def main(): f"(fix at source — wrong @sqlfn resolved):", file=sys.stderr) for cname, sf in ea_bad: print(f" {cname} -> @sqlfn {sf}", file=sys.stderr) + # Guard: @sqlfn names that differ only by case (e.g. 
tDistance vs tdistance) + # are the SAME SQL function (PostgreSQL folds the identifier) but DISTINCT + # binding names — a case-insensitive engine (Spark SQL) registers both under + # one UDF, so one silently shadows the other. Invisible in SQL; surface the + # casing straggler here, to be fixed at the MEOS-C @sqlfn source. + case_bad = lint_sqlfn_case_collisions(idl) + if case_bad: + print(f" ⚠ {len(case_bad)} @sqlfn case-collision(s) (pick ONE canonical " + f"spelling at the MEOS-C source — binding-breaking otherwise):", file=sys.stderr) + for _lo, spellings in case_bad: + print(f" {' vs '.join(spellings)}", file=sys.stderr) idl_path = OUTPUT_DIR / "meos-idl.json" with open(idl_path, "w") as f: