This vignette reproduces the Splink
“Linking banking transactions” demo in irelink. It
demonstrates two-table linkage with link_type = "link" by
matching each outgoing payment in the origin table to the corresponding
incoming payment in the destination table.
The data is synthetic and intentionally challenging. Amounts differ
because of fees and exchange-rate effects, dates can shift by a few
days, and memos are sometimes truncated. Because each origin payment has
exactly one destination counterpart, the prior match probability is
1 / n_origin.
This vignette requires nanoparquet to
read the remote Parquet files, and it only compiles when the package and
both data URLs are available. It also assumes DuckDB because the
blocking rules below use raw .where SQL with DuckDB date
helpers such as strftime() and yearweek().
library(irelink)
library(ggplot2)
df_origin
#> # A data frame: 45,326 × 5
#> ground_truth memo transaction_date amount unique_id
#> <dbl> <chr> <date> <dbl> <dbl>
#> 1 0 MATTHIAS C paym 2022-03-28 36.4 0
#> 2 1 M CORVINUS dona 2022-02-14 222. 1
#> 3 2 M C donation BG 2022-05-04 450. 2
#> 4 3 M C BGC 2022-03-03 208. 3
#> 5 4 M CORVINUS CSH 2022-02-04 79.7 4
#> 6 5 M C WRE 2022-03-26 835. 5
#> 7 6 M CORVINUS CSH 2022-05-01 66.7 6
#> 8 7 M C 1b097ab5 CH 2022-03-15 26246. 7
#> 9 8 M C WRE 2022-03-26 92.8 8
#> 10 9 M C payment CHQ 2022-05-04 211. 9
#> # ℹ 45,316 more rows
df_destination
#> # A data frame: 45,326 × 5
#> ground_truth memo transaction_date amount unique_id
#> <dbl> <chr> <date> <dbl> <dbl>
#> 1 0 "MATTHIAS C payment BGC" 2022-03-29 36.4 0
#> 2 1 "M CORVINUS BGC" 2022-02-16 222. 1
#> 3 2 "M C" 2022-05-05 450. 2
#> 4 3 "M C payment" 2022-03-04 199. 3
#> 5 4 "M CORVINUS " 2022-02-05 79.7 4
#> 6 5 "M C " 2022-03-27 835. 5
#> 7 6 "M CORVINUS dona" 2022-05-05 66.7 6
#> 8 7 "M C CHQ" 2022-03-27 25908. 7
#> 9 8 "M C WRE" 2022-03-27 91.9 8
#> 10 9 "M C payment CHQ" 2022-05-17 212. 9
#> # ℹ 45,316 more rowsil_profile(df_origin, memo, transaction_date, amount, con = con, top_n = 8)
#> # A tibble: 24 × 3
#> column value n
#> <chr> <chr> <dbl>
#> 1 memo J B payment BGC 27
#> 2 memo J B donation BG 25
#> 3 memo J B money BGC 24
#> 4 memo J B BGC 21
#> 5 memo J S money BGC 18
#> 6 memo J P BGC 18
#> 7 memo A B money BGC 18
#> 8 memo J C money BGC 17
#> 9 transaction_date 19122 696
#> 10 transaction_date 19119 693
#> # ℹ 14 more rowsBecause corresponding records differ in predictable ways, the
blocking rules need to be broad enough to retain true matches while
still shrinking the search space. Fees change amounts, dates shift, and
memos are truncated, so the rules below use SQL expressions in
.where rather than relying on exact agreement alone:
counts <- il_count_pairs(
df_origin,
df_destination,
# Same year-month, similar memo prefix, amount ratio within 30%
block_on(
.where = paste(
"strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, '%Y%m')",
'AND substr(l.memo, 1, 3) = substr(r.memo, 1, 3)',
'AND l.amount / r.amount > 0.7 AND l.amount / r.amount < 1.3'
)
),
# Same but offset by 15 days to catch month boundaries
block_on(
.where = paste(
"strftime(l.transaction_date + 15, '%Y%m') = strftime(r.transaction_date, '%Y%m')",
'AND substr(l.memo, 1, 3) = substr(r.memo, 1, 3)',
'AND l.amount / r.amount > 0.7 AND l.amount / r.amount < 1.3'
)
),
# Memo prefix (first 9 characters)
block_on(.where = 'substr(l.memo, 1, 9) = substr(r.memo, 1, 9)'),
# Rounded amount + same week
block_on(
.where = paste(
'round(l.amount / 2, 0) * 2 = round(r.amount / 2, 0) * 2',
'AND yearweek(r.transaction_date) = yearweek(l.transaction_date)'
)
),
# Amount offset + week offset
block_on(
.where = paste(
'round(l.amount / 2, 0) * 2 = round((r.amount + 1) / 2, 0) * 2',
'AND yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)'
)
),
# Ground-truth "cheat" rule for completeness
block_on(unique_id),
con = con,
link_type = 'link'
)
counts
#> # A tibble: 6 × 4
#> rule n_pairs cumulative_pairs pct_of_cartesian
#> <chr> <dbl> <dbl> <dbl>
#> 1 strftime(l.transaction_date, '%Y%m'… 301614 301614 0.0147
#> 2 strftime(l.transaction_date + 15, '… 281675 428250 0.0208
#> 3 substr(l.memo, 1, 9) = substr(r.mem… 330510 710190 0.0346
#> 4 round(l.amount / 2, 0) * 2 = round(… 353563 1051372 0.0512
#> 5 round(l.amount / 2, 0) * 2 = round(… 352877 1321538 0.0643
#> 6 unique_id 45326 1321565 0.0643The transaction_date comparison is one-sided because a
payment can only arrive after it is sent. The comparison therefore
checks whether destination_date - origin_date is between 0
and N days:
spec <- il_spec() |>
il_compare(amount, cl_pct_diff(0.01, 0.03, 0.10, 0.30)) |>
il_compare(memo, cl_levenshtein(2, 6, 10)) |>
il_compare(
transaction_date,
cl_levels(
cl_null(),
cl_custom('(r.{col} - l.{col}) BETWEEN 0 AND 1'),
cl_custom('(r.{col} - l.{col}) BETWEEN 0 AND 4'),
cl_custom('(r.{col} - l.{col}) BETWEEN 0 AND 10'),
cl_custom('(r.{col} - l.{col}) BETWEEN 0 AND 30'),
cl_else()
)
) |>
il_block_on(
.where = paste(
"strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, '%Y%m')",
'AND substr(l.memo, 1, 3) = substr(r.memo, 1, 3)',
'AND l.amount / r.amount > 0.7 AND l.amount / r.amount < 1.3'
)
) |>
il_block_on(
.where = paste(
"strftime(l.transaction_date + 15, '%Y%m') = strftime(r.transaction_date, '%Y%m')",
'AND substr(l.memo, 1, 3) = substr(r.memo, 1, 3)',
'AND l.amount / r.amount > 0.7 AND l.amount / r.amount < 1.3'
)
) |>
il_block_on(.where = 'substr(l.memo, 1, 9) = substr(r.memo, 1, 9)') |>
il_block_on(
.where = paste(
'round(l.amount / 2, 0) * 2 = round(r.amount / 2, 0) * 2',
'AND yearweek(r.transaction_date) = yearweek(l.transaction_date)'
)
) |>
il_block_on(
.where = paste(
'round(l.amount / 2, 0) * 2 = round((r.amount + 1) / 2, 0) * 2',
'AND yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)'
)
) |>
il_block_on(unique_id)
spec
#> Linkage Specification
#> Comparisons (3):
#> amount : pct_diff
#> memo : levenshtein
#> transaction_date : levels
#> Blocking rules (6, OR-ed):
#> 1. WHERE strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, '%Y%m') AND substr(l.memo, 1, 3) = substr(r.memo, 1, 3) AND l.amount / r.amount > 0.7 AND l.amount / r.amount < 1.3
#> 2. WHERE strftime(l.transaction_date + 15, '%Y%m') = strftime(r.transaction_date, '%Y%m') AND substr(l.memo, 1, 3) = substr(r.memo, 1, 3) AND l.amount / r.amount > 0.7 AND l.amount / r.amount < 1.3
#> 3. WHERE substr(l.memo, 1, 9) = substr(r.memo, 1, 9)
#> 4. WHERE round(l.amount / 2, 0) * 2 = round(r.amount / 2, 0) * 2 AND yearweek(r.transaction_date) = yearweek(l.transaction_date)
#> 5. WHERE round(l.amount / 2, 0) * 2 = round((r.amount + 1) / 2, 0) * 2 AND yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)
#> 6. unique_idBecause this benchmark is one-to-one, set the prevalence prior
directly with il_prior_prevalence() instead of changing
model$params by hand:
model <- il_model(
df_origin,
df_destination,
spec = spec,
con = con,
link_type = 'link'
)
model <- il_prior_prevalence(model, 1 / nrow(df_origin))
model <- il_estimate_u(model, max_pairs = 1e6) |>
il_estimate_em(block_on(memo)) |>
il_estimate_em(block_on(amount))
#> EM trained: amount and transaction_date | skipped (blocked on): memo
#> EM trained: memo and transaction_date | skipped (blocked on): amountsummary(model)
#> irelink Model
#> Status: Trained
#> Link type: link
#> Records: 45326
#> Records (right): 45326
#> Comparisons: 3
#> Blocking rules: 6
#>
#> Parameters:
#> prior: 0.003723228
#> priors: # A tibble: 1 × 5
#> priors: family comparison gamma_level probability strength
#> priors: <chr> <chr> <int> <dbl> <dbl>
#> priors: 1 prevalence <NA> NA 0.0000221 NA
#> comparisons: # A tibble: 14 × 4
#> comparisons: comparison gamma_level m u
#> comparisons: <chr> <int> <dbl> <dbl>
#> comparisons: 1 amount 0 0.001000 0.875
#> comparisons: 2 amount 1 0.001000 0.0873
#> comparisons: 3 amount 2 0.232 0.0266
#> comparisons: 4 amount 3 0.319 0.00743
#> comparisons: 5 amount 4 0.447 0.00376
#> comparisons: 6 memo 0 0.0500 0.856
#> comparisons: 7 memo 1 0.145 0.111
#> comparisons: 8 memo 2 0.263 0.0279
#> comparisons: 9 memo 3 0.543 0.00486
#> comparisons: 10 transaction_date 0 0.001000 0.719
#> comparisons: 11 transaction_date 1 0.0456 0.168
#> comparisons: 12 transaction_date 2 0.0931 0.0600
#> comparisons: 13 transaction_date 3 0.468 0.0313
#> comparisons: 14 transaction_date 4 0.392 0.0211
#> u_estimation: 1e+06
#> u_estimation: FALSE
#> u_estimation: NULL
#> u_estimation: NULL
#> u_estimation: 1000000
#> u_estimation: 1predictions <- predict(model, threshold = 0.001)
predictions
#> # A tibble: 594,672 × 8
#> unique_id_l unique_id_r gamma_amount gamma_memo gamma_transaction_date
#> * <dbl> <dbl> <int> <int> <int>
#> 1 15235 15235 3 2 3
#> 2 33565 33565 3 1 1
#> 3 35683 35683 4 2 1
#> 4 43391 43391 2 2 1
#> 5 44423 44423 2 1 1
#> 6 273 273 4 3 4
#> 7 307 26617 1 3 2
#> 8 663 24394 2 1 4
#> 9 695 695 3 3 3
#> 10 762 36624 2 0 3
#> # ℹ 594,662 more rows
#> # ℹ 3 more variables: match_weight <dbl>, total_match_weight <dbl>,
#> # match_probability <dbl>acc <- il_accuracy(model, labels_col = 'ground_truth')
acc
#> # A tibble: 102 × 16
#> threshold tp fp fn tn fn_blocking_miss precision recall f1
#> <dbl> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
#> 1 0 45326 1276239 0 0 0 0.0343 1 0.0663
#> 2 3.47e-10 45326 1276239 0 0 0 0.0343 1 0.0663
#> 3 3.47e- 9 45326 1249201 0 27038 0 0.0350 1 0.0677
#> 4 7.69e- 9 45326 1207339 0 68900 0 0.0362 1 0.0698
#> 5 5.58e- 8 45326 1192653 0 83586 0 0.0366 1 0.0706
#> 6 6.76e- 8 45326 1080324 0 195915 0 0.0403 1 0.0774
#> 7 7.71e- 8 45326 1074477 0 201762 0 0.0405 1 0.0778
#> 8 3.87e- 7 45326 1029792 0 246447 0 0.0422 1 0.0809
#> 9 5.59e- 7 45326 1027712 0 248527 0 0.0422 1 0.0811
#> 10 6.64e- 7 45326 1007379 0 268860 0 0.0431 1 0.0826
#> # ℹ 92 more rows
#> # ℹ 7 more variables: f2 <dbl>, f0_5 <dbl>, specificity <dbl>, npv <dbl>,
#> # accuracy <dbl>, p4 <dbl>, phi <dbl>errors <- il_errors(model, labels_col = 'ground_truth', threshold = 0.5)
errors[errors$error_type == 'false_positive', ]
#> # A tibble: 43,970 × 6
#> unique_id_l unique_id_r match_weight match_probability true_label error_type
#> <dbl> <dbl> <dbl> <dbl> <lgl> <chr>
#> 1 31596 38845 9.29 0.701 FALSE false_posi…
#> 2 31084 6988 8.24 0.531 FALSE false_posi…
#> 3 33498 20282 9.70 0.756 FALSE false_posi…
#> 4 37520 33251 12.9 0.966 FALSE false_posi…
#> 5 40061 18146 10.3 0.821 FALSE false_posi…
#> 6 42832 26496 10.0 0.794 FALSE false_posi…
#> 7 1304 39792 10.8 0.867 FALSE false_posi…
#> 8 841 4139 11.2 0.896 FALSE false_posi…
#> 9 490 8100 14.0 0.984 FALSE false_posi…
#> 10 353 17115 10.0 0.794 FALSE false_posi…
#> # ℹ 43,960 more rowserrors[errors$error_type == 'false_negative', ]
#> # A tibble: 5,426 × 6
#> unique_id_l unique_id_r match_weight match_probability true_label error_type
#> <dbl> <dbl> <dbl> <dbl> <lgl> <chr>
#> 1 16149 16149 7.72 0.440 TRUE false_nega…
#> 2 42901 42901 8.05 0.497 TRUE false_nega…
#> 3 40135 40135 5.38 0.135 TRUE false_nega…
#> 4 3241 3241 7.01 0.326 TRUE false_nega…
#> 5 5649 5649 5.38 0.135 TRUE false_nega…
#> 6 7205 7205 5.38 0.135 TRUE false_nega…
#> 7 8424 8424 7.90 0.472 TRUE false_nega…
#> 8 12130 12130 2.93 0.0277 TRUE false_nega…
#> 9 20964 20964 5.23 0.123 TRUE false_nega…
#> 10 31274 31274 7.01 0.326 TRUE false_nega…
#> # ℹ 5,416 more rows