Twenty canonical examples covering real-world data-mapping scenarios, from legacy mainframe migrations to modern platform architectures. Each example is a complete, parser-validated .stm file you can use as a starting point.
SQL Server 2008 to PostgreSQL 16 with PII handling, type mapping, enum normalization, and UUID generation for 2.4M customer records.
mapping `customer migration` {
  source { `legacy_sqlserver` }
  target { `postgres_db` }

  CUST_ID -> customer_id { uuid_v5("6ba7b810-...", CUST_ID) }
  CUST_ID -> legacy_customer_id
  CUST_TYPE -> customer_type {
    map {
      R: "retail"
      B: "business"
      G: "government"
      null: "retail"
    }
  }
  FIRST_NM -> first_name { trim | title_case | null_if_empty }
  LAST_NM -> last_name { trim | title_case | null_if_empty }
  EMAIL_ADDR -> email { trim | lowercase | validate_email | null_if_invalid }
}
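The description mentions PII handling, which the mapping itself does not show. A minimal sketch of how the PostgreSQL target schema might mark those fields, reusing the `pii` and `mask` tokens from the governance example later on this page — the schema name and column sizes here are illustrative, not taken from the scenario:

```
// Hypothetical target schema for `customer migration` — names and sizes assumed
schema postgres_customers {
  customer_id UUID (pk, required)
  email STRING(255) (pii, mask partial_email)
  first_name STRING(100) (pii, mask redact)
  last_name STRING(100) (pii, mask redact)
}
```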
Maps Sales Cloud Opportunity and Account objects into a Snowflake analytics warehouse with currency conversion and stage normalization.
mapping `opportunity ingestion` {
  source { `sfdc_opportunity`, `fx_spot_rates` }
  target { `snowflake_opps` }

  Id -> opp_key
  AccountId -> account_key
  Name -> opportunity_name
  Amount -> amount_usd {
    "Multiply by rate from `fx_spot_rates` using CurrencyIsoCode"
    | round(2)
  }
  StageName -> pipeline_stage {
    map {
      Prospecting: "top_funnel"
      Qualification: "mid_funnel"
      Closed_Won: "closed_won"
      _: "unknown"
    }
  }
}
Transforms a mainframe COBOL customer master record into an Avro customer-change event for Kafka. Handles COMP-3 packed decimals, REDEFINES, and EBCDIC encoding.
schema cobol_customer_master (format copybook, encoding ebcdic) {
  CUST_ID INTEGER (pic "9(10)", offset 1, length 10, required)
  CUST_TYPE STRING (pic "X(1)", offset 11, enum {R, B})
  PERSONAL_NAME record (offset 12, length 40) {
    FIRST_NAME STRING (pic "X(20)", offset 12)
    LAST_NAME STRING (pic "X(20)", offset 32)
  }
  BUSINESS_NAME record (redefines PERSONAL_NAME, offset 12) {
    COMPANY_NAME STRING (pic "X(40)", offset 12)
  }
  ADDRESS record (offset 52, length 57) {
    STREET STRING (pic "X(30)", offset 52)
    CITY STRING (pic "X(20)", offset 82)
    STATE STRING (pic "X(2)", offset 102)
  }
}
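The intro mentions COMP-3 packed decimals, but every field above uses display format. A hedged sketch of what a packed-decimal field might look like, assuming the DSL accepts a `comp-3` token alongside the pic clause — the field name, offset, and the token itself are illustrative and not confirmed by the schema above:

```
// Hypothetical packed-decimal field — `comp-3` token is assumed syntax.
// S9(7)V99 COMP-3 packs 9 digits plus a sign nibble into 5 bytes.
ACCT_BALANCE DECIMAL(9,2) (pic "S9(7)V99", comp-3, offset 109, length 5)
```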
Consumes protobuf commerce events from Kafka and produces session-level Parquet for analytics. Demonstrates group-by aggregation and error-handling defaults.
mapping `session aggregation` (group_by session_id, on_error log, error_threshold 0.02) {
  source { `commerce_event_pb` }
  target { `commerce_session_parquet` }

  session_id -> session_id
  user_id -> user_id { first }
  event_ts -> session_start_utc { min | "Parse minimum timestamp as UTC." }
  event_ts -> session_end_utc { max | "Parse maximum timestamp as UTC." }
  source_system -> source_system { first }
  country_code -> country_code { first | uppercase }
  Context.entry_channel -> entry_channel { first | lowercase }
  Context.device_type -> device_type { first | lowercase }
}
Transforms EDI 856 (DESADV) fixed-length ASN messages into JSON for warehouse ingestion. Demonstrates qualifier-driven filter, positional correlation with each, implied decimals, and data gap documentation.
schema edi_desadv (format fixed-length) {
  BeginningOfMessage record {
    DOCNUM CHAR(35)    // despatch advice number
    MESSGFUN CHAR(3)   // message function: 9 = Original
  }
  ShipmentRefs list_of record (filter SHPRFQUAL == "SRN") {
    SHPRFQUAL CHAR(3)
    SHIPREF CHAR(70)   // shipment reference number
  }
  POReferences list_of record (filter REFQUAL == "ON") {
    REFQUAL CHAR(3)
    REFNUM CHAR(35)    // PO Number + "/" + Dissection No
  }
  LineItems list_of record {
    ITEMQTY NUMBER(15) //! 4 implied decimal places
  }
}
Ingests namespace-qualified commerce order XML from an integration bus and lands curated Parquet datasets for order headers and line items.
schema commerce_order (
  format xml,
  namespace ord "http://example.com/commerce/order/v2",
  namespace com "http://example.com/common/v1"
) {
  Order record (xpath "/ord:OrderMessage/ord:Order") {
    OrderId STRING (xpath "ord:OrderId")
    Channel STRING (xpath "ord:Channel")
    CurrencyCode STRING (xpath "ord:Currency/com:Code")
    Customer record {
      CustomerId STRING (xpath "ord:CustomerId")
      LoyaltyTier STRING (xpath "ord:LoyaltyTier")
    }
    LineItems list_of record (xpath "ord:LineItems/ord:LineItem") {
      SKU STRING (xpath "ord:SKU")
      Quantity INT32 (xpath "ord:Quantity")
      UnitPrice DECIMAL(12,2) (xpath "ord:UnitPrice")
    }
  }
}
Maps SAP ERP purchase orders into Oracle MFCS ingestion contracts. A business-level mapping with explicit cross-reference resolution notes.
schema sap_purchase_order (format sap) {
  EBELN STRING(10) (pk, required, note "SAP purchase order number")
  BUKRS STRING(4) (required, note "Company code")
  BSART STRING(4) (required, enum {NB, UB, FO, ZST})
  LIFNR STRING(10) (required, note "Vendor account number")
  WAERS STRING(3) (required, note "Document currency")
  Items list_of record {
    EBELP STRING(5) (pk, required, note "Item number")
    MATNR STRING(18) (note "Material number")
    MENGE DECIMAL(13,3) (required)
    NETPR DECIMAL(11,2) (note "Net price")
    MEINS STRING(3) (required, note "Unit of measure")
    WERKS STRING(4) (required, note "Plant")
  }
}
Aggregates customer, payment, and inventory data from three independent sources and distributes to analytics, notification, and reporting targets.
mapping `payments to analytics` {
  source { `payment_gateway` }
  target { `analytics_db` }

  amount -> total_spent {
    "Sum all transactions where status = 'completed',
     grouped by customer_email.
     Join to crm_system on email to resolve customer_id."
  }
  -> transaction_count {
    "Count transactions where status = 'completed', per customer."
  }
  -> last_transaction_date {
    "Max transaction timestamp where status = 'completed',
     per customer."
  }
}
Combines CRM profiles, order history, and support tickets into a Customer 360 view. Demonstrates filtered multi-source joins with aggregation.
mapping `customer 360` {
  source {
    `crm_customers` (filter "email NOT LIKE '%@test.internal'")
    `order_transactions` (filter "status IN ('completed', 'refunded')")
    `support_tickets` (filter "created_at >= date_sub(now(), 12 month)")

    "Join crm_customers to order_transactions on customer_id (left join).
     Join crm_customers to support_tickets on customer_id (left join)."
  }
  target { `customer_360` }

  crm_customers.customer_id -> customer_id
  crm_customers.email -> email { trim | lowercase }
  crm_customers.segment -> segment { lowercase }
  -> full_name {
    "Concat first_name + ' ' + last_name. If both null, use company_name."
  }
}
Multi-team retail platform with POS, e-commerce, and warehouse namespaces. Shows global fragments, cross-namespace references, and hub loading.
fragment audit_fields {
  created_at TIMESTAMPTZ (required)
  updated_at TIMESTAMPTZ
  etl_batch_id VARCHAR(50)
}

namespace warehouse (note "Data vault hub layer") {
  schema hub_store {
    store_hk BINARY(32) (pk)
    store_id VARCHAR(20) (required, unique)
    ...audit_fields
  }

  mapping `load hub_store` {
    source { `pos::stores` }
    target { `hub_store` }

    STORE_ID -> store_hk { md5(STORE_ID) }
    STORE_ID -> store_id
    STORE_NAME -> store_name { trim }
  }
}
Multi-system retail platform (POS, e-commerce, warehouse namespaces) importing definitions from namespaces.stm. Demonstrates cross-namespace hub loading and derived arrows.
import { pos::stores, pos::transactions } from "./namespaces.stm"
import { ecom::orders, ecom::customers } from "./namespaces.stm"

transform dv_hash_key {
  trim | lowercase | md5
}

namespace vault (note "Data Vault 2.0 — hubs, links, and satellites") {
  schema hub_contact {
    contact_hk BINARY(32) (pk)
    contact_bk VARCHAR(36) (required, unique)
    ...standard_metadata
  }

  schema sat_contact_details (note "Contact PII satellite") {
    contact_hk BINARY(32) (pk, ref hub_contact.contact_hk)
    load_ts TIMESTAMPTZ (pk)
    email VARCHAR(255) (pii)
    phone VARCHAR(30) (pii)
  }
}
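The description promises cross-namespace hub loading, but no mapping appears above. A sketch modeled on the `load hub_store` mapping from the namespaces example, reusing the `dv_hash_key` transform and assuming the imported `ecom::customers` exposes a `customer_id` business key (that source field name is a guess):

```
// Hypothetical hub load — source field name assumed, not confirmed above
mapping `load hub_contact` {
  source { `ecom::customers` }
  target { `hub_contact` }

  customer_id -> contact_hk { dv_hash_key }
  customer_id -> contact_bk
}
```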
Same namespace appearing multiple times in one file with cross-namespace references between merged blocks. Tests global shadowing patterns.
namespace staging (note "Cleaned and conformed staging area") {
  // Map HR employees to staging — source is in a merged namespace block
  mapping `stage employees` {
    source { `source::hr_employees` }
    target { `stg_employees` }

    employee_id -> employee_hk { hash_key }
    employee_id -> employee_id
    full_name -> full_name { clean_string }
    email -> email { trim | lowercase }
    department -> department { clean_string }
    -> is_active {
      "True if no termination record exists in the HR system"
    }
  }
}
Shared library: reusable fragments (address fields, audit columns) and reference schemas for country codes, currency rates, and product catalog. Used by multiple workspace scenarios.
schema country_codes (note "ISO 3166 country code mapping") {
  alpha2 STRING(2) (pk)
  alpha3 STRING(3)
  name STRING(100)
}

schema currency_rates (note "Daily FX rates") {
  from_ccy ISO-4217 (pk)
  to_ccy ISO-4217 (pk)
  rate DECIMAL(18,8)
  as_of DATE
}

fragment `address fields` {
  line1 STRING(200) (required)
  line2 STRING(200)
  city STRING(100) (required)
  state STRING(50)
  postal_code STRING(20) (required)
  country ISO-3166-a2 (required)
}
All four merge patterns in one retail scenario: upsert, append-only event log, soft delete, and full refresh. Demonstrates merge, match_on, and delete_flag metadata tokens.
mapping `customer upsert` (merge upsert, match_on customer_id) {
  source { `crm_customers` }
  target { `dim_customer` }

  customer_id -> customer_id
  full_name -> full_name { trim | title_case }
  email -> email { trim | lowercase }
  -> updated_at { now_utc() }
}

mapping `purchase events` (merge append) {
  source { `pos_events` }
  target { `fact_purchases` }

  event_id -> event_id
  customer_id -> customer_id
  sku -> sku
  quantity -> quantity
  -> ingested_at { now_utc() }
}
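Only the upsert and append patterns are shown above. A hedged sketch of the remaining two, assuming `soft_delete` and `full_refresh` are valid values for the `merge` token and that `delete_flag` names the tombstone column, as the intro suggests — none of this syntax is confirmed by the examples above, and the table and field names are illustrative:

```
// Hypothetical — merge values, delete_flag usage, and names are assumed
mapping `customer soft delete` (merge soft_delete, match_on customer_id, delete_flag is_deleted) {
  source { `crm_customers` }
  target { `dim_customer` }

  customer_id -> customer_id
  -> is_deleted { "True when the customer no longer appears in the source extract." }
  -> updated_at { now_utc() }
}

mapping `store directory refresh` (merge full_refresh) {
  source { `pos_stores` }
  target { `dim_store` }

  store_id -> store_id
  store_name -> store_name { trim }
}
```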
Customer-360 scenario with full governance annotations: owner, steward, compliance, retention, PII marking, encryption, masking, and org-specific tokens.
schema crm_customers (
  owner "crm-data-team",
  steward "maria.garcia@company.com",
  classification "CONFIDENTIAL",
  compliance {GDPR, CCPA},
  retention "7y"
) {
  customer_id UUID (pk, required)
  email STRING(255) (pii, classification "RESTRICTED",
                     encrypt AES-256-GCM, mask partial_email,
                     retention "3y")
  first_name STRING(100) (pii, classification "CONFIDENTIAL", mask redact)
  phone STRING(20) (pii, classification "RESTRICTED",
                    encrypt AES-256-GCM, mask last_four)
}
Maps a REST API order response to flat Parquet using jsonpath metadata. Parallels the XML-to-Parquet example — same scenario, different source format.
schema api_order_response (format json) {
  Order record (jsonpath "$.order") {
    OrderId STRING (jsonpath "$.order.order_id")
    Channel STRING (jsonpath "$.order.channel")
    CurrencyCode STRING (jsonpath "$.order.currency")
    Customer record {
      CustomerId STRING (jsonpath "$.order.customer.id")
      Email STRING (jsonpath "$.order.customer.email")
    }
    LineItems list_of record (jsonpath "$.order.line_items[*]") {
      SKU STRING (jsonpath ".sku")
      Quantity INT32 (jsonpath ".quantity")
      UnitPrice DECIMAL(12,2) (jsonpath ".unit_price")
    }
  }
}
Declares BI dashboards and ML models as pipeline consumers using report and model tokens with source dependencies, tool/registry metadata, and refresh schedules.
schema weekly_sales_dashboard (
  report,
  source {fact_orders, dim_product},
  tool looker,
  dashboard_id "retail-weekly-001",
  refresh schedule "Monday 06:00 UTC"
) {
  total_revenue DECIMAL(14,2)
  units_sold INTEGER
  avg_order_value DECIMAL(10,2)
  top_product VARCHAR(200)
}

schema churn_predictor (
  model,
  source {dim_customer, fact_orders},
  registry mlflow,
  experiment "churn-v3"
) {
  order_count_30d INTEGER
  avg_order_value DECIMAL(10,2)
  days_since_last_order INTEGER
  churn_probability DECIMAL(5,4)
}
Demonstrates scalar list fields, filtered and flattened lists, and governance metadata with classification, retention, and PII annotations.
schema order_events (
  classification "INTERNAL"
) {
  event_id UUID (pk, required)
  event_time TIMESTAMPTZ (required)
  order_status STRING(20) (enum {completed, pending, failed, refunded})
  customer record {
    customer_id UUID (required)
    email STRING(255) (pii, classification "RESTRICTED", retention "3y")
    tier STRING(20) (enum {standard, silver, gold, platinum})
  }
  promo_codes list_of STRING
  tag_ids list_of INT (classification "INTERNAL")
  line_items list_of record (filter item_status != "cancelled") {
    sku STRING(30) (required)
    quantity INT (required)
    unit_price DECIMAL(12,2) (required)
  }
}
Business metric specifications for MRR, CLV, churn rate, and conversion rate. Metric schemas decorate a standard schema with metric metadata and express data flow via mapping blocks.
schema monthly_recurring_revenue (
  metric,
  metric_name "MRR",
  source fact_subscriptions,
  grain monthly,
  slice {customer_segment, product_line, region},
  filter "status = 'active' AND is_trial = false"
) {
  value DECIMAL(14,2) (measure additive)
}

schema churn_rate (
  metric,
  metric_name "Churn Rate",
  grain monthly,
  slice {segment, region, product_line}
) {
  value DECIMAL(5,4) (measure non_additive)
}

schema customer_lifetime_value (
  metric,
  metric_name "CLV",
  source {fact_orders, dim_customer},
  slice {acquisition_channel, segment, cohort_year}
) {
  value DECIMAL(14,2) (measure non_additive)
  order_count INTEGER (measure additive)
  avg_order_value DECIMAL(12,2) (measure non_additive)
}
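The intro says metric schemas express data flow via mapping blocks, but none is shown above. A sketch feeding the MRR metric from `fact_subscriptions`, written in the prose-transform style of the earlier aggregation examples — the normalization rule in the string is illustrative:

```
// Hypothetical metric mapping — transform prose is illustrative
mapping `compute mrr` {
  source { `fact_subscriptions` }
  target { `monthly_recurring_revenue` }

  amount -> value {
    "Normalize amount to a monthly figure by billing_period
     (quarterly / 3, annual / 12), then sum per month
     where status = 'active'."
  }
}
```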
Fact and dimension tables referenced by the metric schemas. Defines the underlying source schemas for subscriptions, orders, opportunities, and sessions.
schema fact_subscriptions (note "Active and historical subscription records") {
  subscription_id STRING(36) (pk)
  customer_id STRING(36) (ref dim_customer.customer_id)
  product_line STRING(50)
  amount DECIMAL(14,2)
  billing_period STRING(10) (enum {monthly, quarterly, annual})
  status STRING(20) (enum {active, cancelled, trial})
  started_at TIMESTAMPTZ
  cancelled_at TIMESTAMPTZ
}

schema fact_orders (note "Commerce order header records") {
  order_id STRING(36) (pk)
  customer_id STRING(36) (ref dim_customer.customer_id)
  order_date DATE
  total_amount DECIMAL(14,2)
  is_gift BOOLEAN (default false)
  loyalty_tier STRING(20)
}
Every example above is parser-validated and ready to use as a template. Get started with the language guide or explore the CLI.