dabstep
Synced from hadley/dabstep
A real-world data-dict.yaml, synced from hadley/dabstep. Download the raw file.
dabstep.yaml
# source: https://github.com/hadley/dabstep
version: 0.1.0
tables:
payments:
description: >
Synthetic dataset of payment transactions processed by a payment
processor. Each row represents one transaction.
source:
parquet: data/payments.parquet
columns:
- name: psp_reference
type: number(id)
constraints: [primary_key]
description: Unique payment identifier.
examples: [10000636248, 29933254267, 49981130818, 70032289018, 89998294921]
- name: merchant
type: enum
values: [Belles_cookbook_store, Crossfit_Hanna, Golfclub_Baron_Friso, Martinis_Fine_Steakhouse, Rafa_AI]
constraints: [required]
description: Merchant name.
- name: card_scheme
type: enum
values: [GlobalCard, NexPay, SwiftCharge, TransactPlus]
constraints: [required]
description: Card scheme (network) used for the transaction.
- name: year
type: number(ordinal)
constraints: [required]
range: [2023, 2023]
description: Year of payment initiation. Always 2023 in this dataset.
- name: hour_of_day
type: number(ordinal)
constraints: [required]
range: [0, 23]
description: Hour of the day of payment initiation (0-23).
- name: minute_of_hour
type: number(ordinal)
constraints: [required]
range: [0, 59]
description: Minute of the hour of payment initiation (0-59).
- name: day_of_year
type: number(ordinal)
constraints: [required]
range: [1, 365]
description: Day of the year of payment initiation (1-365).
- name: month
type: number(ordinal)
constraints: [required]
range: [1, 12]
description: Month of payment initiation (1-12).
- name: is_credit
type: boolean
constraints: [required]
description: >
Whether the card used is a credit card. FALSE indicates a debit
card. Credit transactions are typically more expensive.
- name: eur_amount
type: number(quantity)
constraints: [required]
range: [0.5, 4811.76]
description: Payment amount in euros.
- name: ip_country
type: enum
values: [BE, ES, FR, GR, IT, LU, NL, SE]
constraints: [required]
description: >
Country the shopper was in at the time of the transaction,
determined by IP address. ISO 3166-1 alpha-2 code.
- name: issuing_country
type: enum
values: [BE, ES, FR, GR, IT, LU, NL, SE]
constraints: [required]
description: >
Country of the card-issuing bank. ISO 3166-1 alpha-2 code. When
this matches acquirer_country, the transaction is domestic
(intracountry).
- name: device_type
type: enum
values: [Android, iOS, Linux, MacOS, Other, Windows]
constraints: [required]
description: Device type used for the transaction.
- name: ip_address
type: string
constraints: [required]
description: Hashed shopper IP address.
examples: [__-7u3f9uJKdXS3u_2ukdQ, bYuXXkz9nZuRjS0r9dT-_w, jyxXoQ7r78mNUH0BnlngEw, s-xemKxVf5o0K46vaODxhA, ZzZYJhXWE4MTCE9vcHnCdQ]
- name: email_address
type: string
description: Hashed shopper email address. May be empty.
examples: [__-8prajRhB4WeqqIoY5PA, Byq-k5LkiiVbiYjHS95XsQ, jvXaD_MZf_Ljdf7UvAUMpQ, Rz-JZg73tHtXbVEj0wkr_g, ZZzHkjMLX3_pzQtc1HmvBg]
- name: card_number
type: string
constraints: [required]
description: Hashed card number.
examples: [__8Fy8ayxoTnl0EX9DtIhw, c8fAIVb46qwMwy1_z2TpxA, k97dMMQVxeLQjnWvk0hy3Q, s24En6cJp3dNBoLHXSWiWg, zzzp-SYj0U6e5r2ioWVniQ]
- name: shopper_interaction
type: enum
values:
Ecommerce: Online transactions
POS: In-person or in-store transactions
constraints: [required]
description: Payment method.
- name: card_bin
type: number(id)
constraints: [required]
description: Bank Identification Number (first digits of card number).
examples: [4017, 4526, 4571, 4802, 4920]
- name: has_fraudulent_dispute
type: boolean
constraints: [required]
description: Whether the issuing bank flagged the transaction as a fraudulent dispute.
- name: is_refused_by_adyen
type: boolean
constraints: [required]
description: Whether the transaction was refused by Adyen.
- name: aci
type: enum
values:
A: Card present, non-authenticated
B: Card present, authenticated
C: Tokenized with mobile device
D: Card not present, card on file
E: Card not present, recurring
F: Card not present, 3-D Secure
G: Card not present, non-3-D Secure
constraints: [required]
description: >
Authorization Characteristics Indicator. Identifies how the
transaction was submitted to the acquirer.
- name: acquirer_country
type: enum
values: [FR, GB, IT, NL, US]
constraints: [required]
description: >
Country of the acquiring bank. ISO 3166-1 alpha-2 code.
acquirer_countries:
description: >
Lookup table mapping acquirer names to their country codes.
source:
parquet: data/acquirer_countries.parquet
columns:
- name: acquirer
type: string
constraints: [primary_key]
description: Name of the acquiring bank.
examples: [bank_of_springfield, dagoberts_vault, gringotts, medici, the_savings_and_loan_bank]
- name: country_code
type: enum
values: [FR, GB, IT, NL, US]
constraints: [required]
description: Country of the acquirer. ISO 3166-1 alpha-2 code.
merchants:
description: >
Merchant profiles with business classification and processing
configuration. Each row represents one merchant.
source:
parquet: data/merchants.parquet
columns:
- name: merchant
type: string
constraints: [primary_key]
description: Merchant name.
examples: [AI_Pro_Labs, Crafty_Cuisine, Gym_Pro, Read_and_Co, Yoga_Masters]
- name: capture_delay
type: enum
values:
immediate: Immediate capture
"<3": Under 3 days
"3-5": 3 to 5 days
">5": Over 5 days
manual: Manual capture
constraints: [required]
description: >
Capture delay range. Faster capture is generally more expensive.
- name: merchant_category_code
type: number(id)
constraints: [required, foreign_key]
description: Four-digit Merchant Category Code classifying the business type.
examples: [5812, 5814, 7372, 7997, 8299]
- name: account_type
type: enum
values:
D: Enterprise - Digital
F: Platform - Franchise
H: Enterprise - Hospitality
R: Enterprise - Retail
S: Platform - SaaS
constraints: [required]
description: Business model classification.
merchant_acquirers:
description: >
Junction table linking merchants to their acquiring banks. A merchant
may route transactions through multiple acquirers (typically in
different countries) to enable local acquiring.
source:
parquet: data/merchant_acquirers.parquet
columns:
- name: merchant
type: string
constraints: [required, foreign_key]
description: Merchant name.
examples: [AI_Pro_Labs, Crafty_Cuisine, Gym_Pro, Read_and_Co, Yoga_Masters]
- name: acquirer
type: string
constraints: [required, foreign_key]
description: Name of the acquiring bank.
examples: [bank_of_springfield, dagoberts_vault, gringotts, medici, the_savings_and_loan_bank]
fees:
description: >
Fee rules for payment processing. Each row is a rule with constraints
that determine which transactions it applies to. A rule matches a
transaction when all non-null fields match. Null (scalar) or empty
(list) means "matches all values." The fee for a matched transaction
is: fixed_amount + rate * eur_amount / 10000.
source:
parquet: data/fees.parquet
columns:
- name: ID
type: number(id)
constraints: [primary_key]
description: Unique identifier for the fee rule.
examples: [1, 251, 500, 750, 1000]
- name: card_scheme
type: enum
values: [GlobalCard, NexPay, SwiftCharge, TransactPlus]
constraints: [required]
description: Card scheme this rule applies to. Always set (never null).
- name: account_type
type: enum
values:
D: Enterprise - Digital
F: Platform - Franchise
H: Enterprise - Hospitality
R: Enterprise - Retail
S: Platform - SaaS
description: >
Account types this rule applies to. Empty list means all
account types.
- name: capture_delay
type: enum
values:
immediate: Immediate capture
"<3": Under 3 days
"3-5": 3 to 5 days
">5": Over 5 days
manual: Manual capture
description: >
Capture delay range this rule applies to. Null means all.
Matches the same enum used in the merchants table.
- name: fraud_percent_min
type: number(quantity)
constraints: [required]
range: [7.2, 8.3]
description: >
Minimum monthly fraud level (%) this rule applies to (inclusive).
-Inf means no lower bound.
- name: fraud_percent_max
type: number(quantity)
constraints: [required]
range: [7.2, 8.3]
description: >
Maximum monthly fraud level (%) this rule applies to (exclusive).
Inf means no upper bound.
- name: monthly_volume
type: enum
values:
"<100k": Under 100k euros
100k-1m: 100k to 1m euros
1m-5m: 1m to 5m euros
">5m": Over 5m euros
description: >
Monthly transaction volume range this rule applies to.
Null means all.
- name: merchant_category_code
type: number(id)
examples: [3000, 5499, 5983, 7922, 9399]
description: >
MCCs this rule applies to. Empty list means all MCCs.
- name: is_credit
type: boolean
description: >
Whether this rule applies to credit (TRUE) or debit (FALSE)
transactions. Null means both.
- name: aci
type: enum
values:
A: Card present, non-authenticated
B: Card present, authenticated
C: Tokenized with mobile device
D: Card not present, card on file
E: Card not present, recurring
F: Card not present, 3-D Secure
G: Card not present, non-3-D Secure
description: >
ACI values this rule applies to. Empty list means all ACI
values.
- name: fixed_amount
type: number(quantity)
constraints: [required]
range: [0, 0.14]
description: >
Fixed fee component (also called "absolute fee") in euros
per transaction.
- name: rate
type: number(quantity)
constraints: [required]
range: [10, 99]
description: >
Variable fee rate (also called "relative fee"). Multiply by
transaction value and divide by 10000 to get the variable
component in euros.
- name: intracountry
type: boolean
description: >
Whether this rule applies to domestic (TRUE) or international
(FALSE) transactions. A transaction is domestic when
issuing_country equals acquirer_country. Null means both.
merchant_months:
description: >
Monthly aggregates per merchant, computed from the payments data.
Used for matching payments to fee rules based on monthly fraud
level and transaction volume thresholds.
source:
parquet: data/merchant_months.parquet
columns:
- name: merchant
type: string
constraints: [primary_key, foreign_key]
description: Merchant name.
examples: [Belles_cookbook_store, Crossfit_Hanna, Golfclub_Baron_Friso, Martinis_Fine_Steakhouse, Rafa_AI]
- name: year
type: number(ordinal)
constraints: [primary_key]
range: [2023, 2023]
description: Year.
- name: month
type: number(ordinal)
constraints: [primary_key]
range: [1, 12]
description: Month (1-12).
- name: total_volume
type: number(quantity)
constraints: [required]
range: [91877.05, 438023.59]
description: Total transaction volume in euros for this merchant-month.
- name: fraud_volume
type: number(quantity)
constraints: [required]
range: [5612.78, 42617.6]
description: >
Total volume of transactions flagged as fraudulent disputes,
in euros.
- name: fraud_percent
type: number(quantity)
constraints: [required]
range: [6.09, 11.62]
description: >
Fraud rate as a percentage: fraud_volume / total_volume * 100.
- name: monthly_volume
type: enum
values:
"<100k": Under 100k euros
100k-1m: 100k to 1m euros
1m-5m: 1m to 5m euros
">5m": Over 5m euros
constraints: [required]
description: >
Binned total_volume matching the fee rule volume ranges.
merchant_category_codes:
description: >
Lookup table of Merchant Category Codes (MCCs). MCCs are four-digit
codes assigned by card networks to classify a merchant's business type.
source:
parquet: data/merchant_category_codes.parquet
columns:
- name: mcc
type: number(id)
constraints: [primary_key]
description: Four-digit Merchant Category Code.
examples: [742, 3286, 3679, 5732, 9950]
- name: description
type: string
constraints: [required]
description: Human-readable description of the merchant category.
examples: [Concrete Work Contractors, Eating Places Restaurants, Industrial Supplies, Recreation Services, Veterinary Services]
relationships:
- description: Each payment belongs to one merchant.
cardinality: many-to-one
join: payments.merchant = merchants.merchant
- description: Each merchant has one or more acquirers.
cardinality: one-to-many
join: merchants.merchant = merchant_acquirers.merchant
- description: Each merchant-acquirer references one acquirer.
cardinality: many-to-one
join: merchant_acquirers.acquirer = acquirer_countries.acquirer
- description: Each merchant has monthly aggregates for each active month.
cardinality: one-to-many
join: merchants.merchant = merchant_months.merchant
- description: Each merchant has one merchant category code.
cardinality: many-to-one
join: merchants.merchant_category_code = merchant_category_codes.mcc
glossary:
acquirer: >
The acquiring bank that processes a merchant's card transactions and
settles funds. Also called the "acquiring bank" or "merchant bank."
card scheme: >
The card network (e.g. GlobalCard, NexPay) that sets rules and
interchange rates for transactions on its network.
ACI: >
Authorization Characteristics Indicator. A code that identifies the
transaction flow submitted to the acquirer (e.g. card-present vs
card-not-present, authenticated vs non-authenticated).
MCC: >
Merchant Category Code. A four-digit code assigned by card networks to
classify a merchant's business type, used for risk assessment, fraud
detection, and fee determination.
intracountry: >
A domestic transaction where the card-issuing country and the acquiring
country are the same. Intracountry transactions typically have lower fees
than cross-border transactions.
BIN: >
Bank Identification Number. The first six to eight digits of a card
number that identify the issuing bank.
POS: >
Point of Sale. An in-person or in-store transaction, as opposed to an
online (ecommerce) transaction.
3-D Secure: >
A payment security protocol that adds an extra authentication step for
online card-not-present transactions, reducing fraud risk.
chargeback: >
A reversal of a transaction initiated by the cardholder's issuing bank,
typically due to a dispute or fraud claim.
capture delay: >
The number of days between authorization and settlement of a
transaction. Faster capture is generally more expensive.