dabstep

Synced from hadley/dabstep

A real-world data-dict.yaml, synced from hadley/dabstep. Download the raw file.

dabstep.yaml
# source: https://github.com/hadley/dabstep

version: 0.1.0

tables:
  payments:
    description: >
      Synthetic dataset of payment transactions processed by a payment
      processor. Each row represents one transaction.

    source:
      parquet: data/payments.parquet

    columns:
      - name: psp_reference
        type: number(id)
        constraints: [primary_key]
        description: Unique payment identifier.
        examples: [10000636248, 29933254267, 49981130818, 70032289018, 89998294921]
      - name: merchant
        type: enum
        values: [Belles_cookbook_store, Crossfit_Hanna, Golfclub_Baron_Friso, Martinis_Fine_Steakhouse, Rafa_AI]
        constraints: [required]
        description: Merchant name.
      - name: card_scheme
        type: enum
        values: [GlobalCard, NexPay, SwiftCharge, TransactPlus]
        constraints: [required]
        description: Card scheme (network) used for the transaction.
      - name: year
        type: number(ordinal)
        constraints: [required]
        range: [2023, 2023]
        description: Year of payment initiation. Always 2023 in this dataset.
      - name: hour_of_day
        type: number(ordinal)
        constraints: [required]
        range: [0, 23]
        description: Hour of the day of payment initiation (0-23).
      - name: minute_of_hour
        type: number(ordinal)
        constraints: [required]
        range: [0, 59]
        description: Minute of the hour of payment initiation (0-59).
      - name: day_of_year
        type: number(ordinal)
        constraints: [required]
        range: [1, 365]
        description: Day of the year of payment initiation (1-365).
      - name: month
        type: number(ordinal)
        constraints: [required]
        range: [1, 12]
        description: Month of payment initiation (1-12).
      - name: is_credit
        type: boolean
        constraints: [required]
        description: >
          Whether the card used is a credit card. FALSE indicates a debit
          card. Credit transactions are typically more expensive.
      - name: eur_amount
        type: number(quantity)
        constraints: [required]
        range: [0.5, 4811.76]
        description: Payment amount in euros.
      - name: ip_country
        type: enum
        values: [BE, ES, FR, GR, IT, LU, NL, SE]
        constraints: [required]
        description: >
          Country the shopper was in at the time of the transaction,
          determined by IP address. ISO 3166-1 alpha-2 code.
      - name: issuing_country
        type: enum
        values: [BE, ES, FR, GR, IT, LU, NL, SE]
        constraints: [required]
        description: >
          Country of the card-issuing bank. ISO 3166-1 alpha-2 code. When
          this matches acquirer_country, the transaction is domestic
          (intracountry).
      - name: device_type
        type: enum
        values: [Android, iOS, Linux, MacOS, Other, Windows]
        constraints: [required]
        description: Device type used for the transaction.
      - name: ip_address
        type: string
        constraints: [required]
        description: Hashed shopper IP address.
        examples: [__-7u3f9uJKdXS3u_2ukdQ, bYuXXkz9nZuRjS0r9dT-_w, jyxXoQ7r78mNUH0BnlngEw, s-xemKxVf5o0K46vaODxhA, ZzZYJhXWE4MTCE9vcHnCdQ]
      - name: email_address
        type: string
        description: Hashed shopper email address. May be empty.
        examples: [__-8prajRhB4WeqqIoY5PA, Byq-k5LkiiVbiYjHS95XsQ, jvXaD_MZf_Ljdf7UvAUMpQ, Rz-JZg73tHtXbVEj0wkr_g, ZZzHkjMLX3_pzQtc1HmvBg]
      - name: card_number
        type: string
        constraints: [required]
        description: Hashed card number.
        examples: [__8Fy8ayxoTnl0EX9DtIhw, c8fAIVb46qwMwy1_z2TpxA, k97dMMQVxeLQjnWvk0hy3Q, s24En6cJp3dNBoLHXSWiWg, zzzp-SYj0U6e5r2ioWVniQ]
      - name: shopper_interaction
        type: enum
        values:
          Ecommerce: Online transactions
          POS: In-person or in-store transactions
        constraints: [required]
        description: Payment method.
      - name: card_bin
        type: number(id)
        constraints: [required]
        description: Bank Identification Number (first digits of card number).
        examples: [4017, 4526, 4571, 4802, 4920]
      - name: has_fraudulent_dispute
        type: boolean
        constraints: [required]
        description: Whether the issuing bank flagged the transaction as a fraudulent dispute.
      - name: is_refused_by_adyen
        type: boolean
        constraints: [required]
        description: Whether the transaction was refused by Adyen.
      - name: aci
        type: enum
        values:
          A: Card present, non-authenticated
          B: Card present, authenticated
          C: Tokenized with mobile device
          D: Card not present, card on file
          E: Card not present, recurring
          F: Card not present, 3-D Secure
          G: Card not present, non-3-D Secure
        constraints: [required]
        description: >
          Authorization Characteristics Indicator. Identifies how the
          transaction was submitted to the acquirer.
      - name: acquirer_country
        type: enum
        values: [FR, GB, IT, NL, US]
        constraints: [required]
        description: >
          Country of the acquiring bank. ISO 3166-1 alpha-2 code.

  acquirer_countries:
    description: >
      Lookup table mapping acquirer names to their country codes.

    source:
      parquet: data/acquirer_countries.parquet

    columns:
      - name: acquirer
        type: string
        constraints: [primary_key]
        description: Name of the acquiring bank.
        examples: [bank_of_springfield, dagoberts_vault, gringotts, medici, the_savings_and_loan_bank]
      - name: country_code
        type: enum
        values: [FR, GB, IT, NL, US]
        constraints: [required]
        description: Country of the acquirer. ISO 3166-1 alpha-2 code.

  merchants:
    description: >
      Merchant profiles with business classification and processing
      configuration. Each row represents one merchant.

    source:
      parquet: data/merchants.parquet

    columns:
      - name: merchant
        type: string
        constraints: [primary_key]
        description: Merchant name.
        examples: [AI_Pro_Labs, Crafty_Cuisine, Gym_Pro, Read_and_Co, Yoga_Masters]
      - name: capture_delay
        type: enum
        values:
          immediate: Immediate capture
          "<3": Under 3 days
          "3-5": 3 to 5 days
          ">5": Over 5 days
          manual: Manual capture
        constraints: [required]
        description: >
          Capture delay range. Faster capture is generally more expensive.
      - name: merchant_category_code
        type: number(id)
        constraints: [required, foreign_key]
        description: Four-digit Merchant Category Code classifying the business type.
        examples: [5812, 5814, 7372, 7997, 8299]
      - name: account_type
        type: enum
        values:
          D: Enterprise - Digital
          F: Platform - Franchise
          H: Enterprise - Hospitality
          R: Enterprise - Retail
          S: Platform - SaaS
        constraints: [required]
        description: Business model classification.

  merchant_acquirers:
    description: >
      Junction table linking merchants to their acquiring banks. A merchant
      may route transactions through multiple acquirers (typically in
      different countries) to enable local acquiring.

    source:
      parquet: data/merchant_acquirers.parquet

    columns:
      - name: merchant
        type: string
        constraints: [required, foreign_key]
        description: Merchant name.
        examples: [AI_Pro_Labs, Crafty_Cuisine, Gym_Pro, Read_and_Co, Yoga_Masters]
      - name: acquirer
        type: string
        constraints: [required, foreign_key]
        description: Name of the acquiring bank.
        examples: [bank_of_springfield, dagoberts_vault, gringotts, medici, the_savings_and_loan_bank]

  fees:
    description: >
      Fee rules for payment processing. Each row is a rule with constraints
      that determine which transactions it applies to. A rule matches a
      transaction when all non-null fields match. Null (scalar) or empty
      (list) means "matches all values." The fee for a matched transaction
      is: fixed_amount + rate * eur_amount / 10000.

    source:
      parquet: data/fees.parquet

    columns:
      - name: ID
        type: number(id)
        constraints: [primary_key]
        description: Unique identifier for the fee rule.
        examples: [1, 251, 500, 750, 1000]
      - name: card_scheme
        type: enum
        values: [GlobalCard, NexPay, SwiftCharge, TransactPlus]
        constraints: [required]
        description: Card scheme this rule applies to. Always set (never null).
      - name: account_type
        type: enum
        values:
          D: Enterprise - Digital
          F: Platform - Franchise
          H: Enterprise - Hospitality
          R: Enterprise - Retail
          S: Platform - SaaS
        description: >
          Account types this rule applies to. Empty list means all
          account types.
      - name: capture_delay
        type: enum
        values:
          immediate: Immediate capture
          "<3": Under 3 days
          "3-5": 3 to 5 days
          ">5": Over 5 days
          manual: Manual capture
        description: >
          Capture delay range this rule applies to. Null means all.
          Matches the same enum used in the merchants table.
      - name: fraud_percent_min
        type: number(quantity)
        constraints: [required]
        range: [7.2, 8.3]
        description: >
          Minimum monthly fraud level (%) this rule applies to (inclusive).
          -Inf means no lower bound.
      - name: fraud_percent_max
        type: number(quantity)
        constraints: [required]
        range: [7.2, 8.3]
        description: >
          Maximum monthly fraud level (%) this rule applies to (exclusive).
          Inf means no upper bound.
      - name: monthly_volume
        type: enum
        values:
          "<100k": Under 100k euros
          100k-1m: 100k to 1m euros
          1m-5m: 1m to 5m euros
          ">5m": Over 5m euros
        description: >
          Monthly transaction volume range this rule applies to.
          Null means all.
      - name: merchant_category_code
        type: number(id)
        examples: [3000, 5499, 5983, 7922, 9399]
        description: >
          MCCs this rule applies to. Empty list means all MCCs.
      - name: is_credit
        type: boolean
        description: >
          Whether this rule applies to credit (TRUE) or debit (FALSE)
          transactions. Null means both.
      - name: aci
        type: enum
        values:
          A: Card present, non-authenticated
          B: Card present, authenticated
          C: Tokenized with mobile device
          D: Card not present, card on file
          E: Card not present, recurring
          F: Card not present, 3-D Secure
          G: Card not present, non-3-D Secure
        description: >
          ACI values this rule applies to. Empty list means all ACI
          values.
      - name: fixed_amount
        type: number(quantity)
        constraints: [required]
        range: [0, 0.14]
        description: >
          Fixed fee component (also called "absolute fee") in euros
          per transaction.
      - name: rate
        type: number(quantity)
        constraints: [required]
        range: [10, 99]
        description: >
          Variable fee rate (also called "relative fee"). Multiply by
          transaction value and divide by 10000 to get the variable
          component in euros.
      - name: intracountry
        type: boolean
        description: >
          Whether this rule applies to domestic (TRUE) or international
          (FALSE) transactions. A transaction is domestic when
          issuing_country equals acquirer_country. Null means both.

  merchant_months:
    description: >
      Monthly aggregates per merchant, computed from the payments data.
      Used for matching payments to fee rules based on monthly fraud
      level and transaction volume thresholds.

    source:
      parquet: data/merchant_months.parquet

    columns:
      - name: merchant
        type: string
        constraints: [primary_key, foreign_key]
        description: Merchant name.
        examples: [Belles_cookbook_store, Crossfit_Hanna, Golfclub_Baron_Friso, Martinis_Fine_Steakhouse, Rafa_AI]
      - name: year
        type: number(ordinal)
        constraints: [primary_key]
        range: [2023, 2023]
        description: Year.
      - name: month
        type: number(ordinal)
        constraints: [primary_key]
        range: [1, 12]
        description: Month (1-12).
      - name: total_volume
        type: number(quantity)
        constraints: [required]
        range: [91877.05, 438023.59]
        description: Total transaction volume in euros for this merchant-month.
      - name: fraud_volume
        type: number(quantity)
        constraints: [required]
        range: [5612.78, 42617.6]
        description: >
          Total volume of transactions flagged as fraudulent disputes,
          in euros.
      - name: fraud_percent
        type: number(quantity)
        constraints: [required]
        range: [6.09, 11.62]
        description: >
          Fraud rate as a percentage: fraud_volume / total_volume * 100.
      - name: monthly_volume
        type: enum
        values:
          "<100k": Under 100k euros
          100k-1m: 100k to 1m euros
          1m-5m: 1m to 5m euros
          ">5m": Over 5m euros
        constraints: [required]
        description: >
          Binned total_volume matching the fee rule volume ranges.

  merchant_category_codes:
    description: >
      Lookup table of Merchant Category Codes (MCCs). MCCs are four-digit
      codes assigned by card networks to classify a merchant's business type.

    source:
      parquet: data/merchant_category_codes.parquet

    columns:
      - name: mcc
        type: number(id)
        constraints: [primary_key]
        description: Four-digit Merchant Category Code.
        examples: [742, 3286, 3679, 5732, 9950]
      - name: description
        type: string
        constraints: [required]
        description: Human-readable description of the merchant category.
        examples: [Concrete Work Contractors, Eating Places Restaurants, Industrial Supplies, Recreation Services, Veterinary Services]

relationships:
  - description: Each payment belongs to one merchant.
    cardinality: many-to-one
    join: payments.merchant = merchants.merchant

  - description: Each merchant has one or more acquirers.
    cardinality: one-to-many
    join: merchants.merchant = merchant_acquirers.merchant

  - description: Each merchant-acquirer references one acquirer.
    cardinality: many-to-one
    join: merchant_acquirers.acquirer = acquirer_countries.acquirer

  - description: Each merchant has monthly aggregates for each active month.
    cardinality: one-to-many
    join: merchants.merchant = merchant_months.merchant

  - description: Each merchant has one merchant category code.
    cardinality: many-to-one
    join: merchants.merchant_category_code = merchant_category_codes.mcc

glossary:
  acquirer: >
    The acquiring bank that processes a merchant's card transactions and
    settles funds. Also called the "acquiring bank" or "merchant bank."
  card scheme: >
    The card network (e.g. GlobalCard, NexPay) that sets rules and
    interchange rates for transactions on its network.
  ACI: >
    Authorization Characteristics Indicator. A code that identifies the
    transaction flow submitted to the acquirer (e.g. card-present vs
    card-not-present, authenticated vs non-authenticated).
  MCC: >
    Merchant Category Code. A four-digit code assigned by card networks to
    classify a merchant's business type, used for risk assessment, fraud
    detection, and fee determination.
  intracountry: >
    A domestic transaction where the card-issuing country and the acquiring
    country are the same. Intracountry transactions typically have lower fees
    than cross-border transactions.
  BIN: >
    Bank Identification Number. The first six to eight digits of a card
    number that identify the issuing bank.
  POS: >
    Point of Sale. An in-person or in-store transaction, as opposed to an
    online (ecommerce) transaction.
  3-D Secure: >
    A payment security protocol that adds an extra authentication step for
    online card-not-present transactions, reducing fraud risk.
  chargeback: >
    A reversal of a transaction initiated by the cardholder's issuing bank,
    typically due to a dispute or fraud claim.
  capture delay: >
    The number of days between authorization and settlement of a
    transaction. Faster capture is generally more expensive.