--- title: "Linking Banking Transactions" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Linking Banking Transactions} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} can_run <- requireNamespace('nanoparquet', quietly = TRUE) if (can_run) { base_url <- paste0( 'https://raw.githubusercontent.com/', 'moj-analytical-services/splink_datasets/', 'master/data/' ) tmp_ori <- paste0(tempdir(), '/transactions_origin.parquet') tmp_dest <- paste0(tempdir(), '/transactions_destination.parquet') df_origin <- try( { utils::download.file( paste0(base_url, 'transactions_origin.parquet'), tmp_ori, mode = 'wb', quiet = TRUE ) nanoparquet::read_parquet(tmp_ori) }, silent = TRUE ) df_destination <- try( { utils::download.file( paste0(base_url, 'transactions_destination.parquet'), tmp_dest, mode = 'wb', quiet = TRUE ) nanoparquet::read_parquet(tmp_dest) }, silent = TRUE ) can_run <- !inherits(df_origin, 'try-error') && !inherits(df_destination, 'try-error') } knitr::opts_chunk$set( collapse = TRUE, comment = '#>', eval = can_run ) ``` This vignette reproduces the [Splink "Linking banking transactions" demo](https://moj-analytical-services.github.io/splink/demos/examples/duckdb/transactions.html) in `irelink`. It demonstrates two-table linkage with `link_type = "link"` by matching each outgoing payment in the origin table to the corresponding incoming payment in the destination table. The data is synthetic and intentionally challenging. Amounts differ because of fees and exchange-rate effects, dates can shift by a few days, and memos are sometimes truncated. Because each origin payment has exactly one destination counterpart, the prior match probability is `1 / n_origin`. This vignette requires [nanoparquet](https://cran.r-project.org/package=nanoparquet) to read the remote Parquet files, and it only compiles when the package and both data URLs are available. It also assumes DuckDB because the blocking rules below use raw `.where` SQL with DuckDB date helpers such as `strftime()` and `yearweek()`. ## Load the data ```{r load-data} library(irelink) library(ggplot2) df_origin df_destination ``` ## Profile the data ```{r setup-con} con <- DBI::dbConnect(duckdb::duckdb()) ``` ```{r profile} il_profile(df_origin, memo, transaction_date, amount, con = con, top_n = 8) ``` ## Choose blocking rules Because corresponding records differ in predictable ways, the blocking rules need to be broad enough to retain true matches while still shrinking the search space. Fees change amounts, dates shift, and memos are truncated, so the rules below use SQL expressions in `.where` rather than relying on exact agreement alone: ```{r count-pairs} counts <- il_count_pairs( df_origin, df_destination, # Same year-month, similar memo prefix, amount ratio within 30% block_on( .where = paste( "strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, '%Y%m')", 'AND substr(l.memo, 1, 3) = substr(r.memo, 1, 3)', 'AND l.amount / r.amount > 0.7 AND l.amount / r.amount < 1.3' ) ), # Same but offset by 15 days to catch month boundaries block_on( .where = paste( "strftime(l.transaction_date + 15, '%Y%m') = strftime(r.transaction_date, '%Y%m')", 'AND substr(l.memo, 1, 3) = substr(r.memo, 1, 3)', 'AND l.amount / r.amount > 0.7 AND l.amount / r.amount < 1.3' ) ), # Memo prefix (first 9 characters) block_on(.where = 'substr(l.memo, 1, 9) = substr(r.memo, 1, 9)'), # Rounded amount + same week block_on( .where = paste( 'round(l.amount / 2, 0) * 2 = round(r.amount / 2, 0) * 2', 'AND yearweek(r.transaction_date) = yearweek(l.transaction_date)' ) ), # Amount offset + week offset block_on( .where = paste( 'round(l.amount / 2, 0) * 2 = round((r.amount + 1) / 2, 0) * 2', 'AND yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)' ) ), # Ground-truth "cheat" rule for completeness block_on(unique_id), con = con, link_type = 'link' ) counts ``` ```{r count-pairs-plot, fig.width = 7, fig.height = 4} autoplot(counts) ``` ## Define the specification The `transaction_date` comparison is one-sided because a payment can only arrive after it is sent. The comparison therefore checks whether `destination_date - origin_date` is between 0 and `N` days: ```{r spec} spec <- il_spec() |> il_compare(amount, cl_pct_diff(0.01, 0.03, 0.10, 0.30)) |> il_compare(memo, cl_levenshtein(2, 6, 10)) |> il_compare( transaction_date, cl_levels( cl_null(), cl_custom('(r.{col} - l.{col}) BETWEEN 0 AND 1'), cl_custom('(r.{col} - l.{col}) BETWEEN 0 AND 4'), cl_custom('(r.{col} - l.{col}) BETWEEN 0 AND 10'), cl_custom('(r.{col} - l.{col}) BETWEEN 0 AND 30'), cl_else() ) ) |> il_block_on( .where = paste( "strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, '%Y%m')", 'AND substr(l.memo, 1, 3) = substr(r.memo, 1, 3)', 'AND l.amount / r.amount > 0.7 AND l.amount / r.amount < 1.3' ) ) |> il_block_on( .where = paste( "strftime(l.transaction_date + 15, '%Y%m') = strftime(r.transaction_date, '%Y%m')", 'AND substr(l.memo, 1, 3) = substr(r.memo, 1, 3)', 'AND l.amount / r.amount > 0.7 AND l.amount / r.amount < 1.3' ) ) |> il_block_on(.where = 'substr(l.memo, 1, 9) = substr(r.memo, 1, 9)') |> il_block_on( .where = paste( 'round(l.amount / 2, 0) * 2 = round(r.amount / 2, 0) * 2', 'AND yearweek(r.transaction_date) = yearweek(l.transaction_date)' ) ) |> il_block_on( .where = paste( 'round(l.amount / 2, 0) * 2 = round((r.amount + 1) / 2, 0) * 2', 'AND yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)' ) ) |> il_block_on(unique_id) spec ``` ## Train the model Because this benchmark is one-to-one, set the prevalence prior directly with `il_prior_prevalence()` instead of changing `model$params` by hand: ```{r model} model <- il_model( df_origin, df_destination, spec = spec, con = con, link_type = 'link' ) model <- il_prior_prevalence(model, 1 / nrow(df_origin)) model <- il_estimate_u(model, max_pairs = 1e6) |> il_estimate_em(block_on(memo)) |> il_estimate_em(block_on(amount)) ``` ## Inspect the trained model ```{r summary} summary(model) ``` ```{r weights-plot, fig.width = 7, fig.height = 4} autoplot(model) ``` ```{r params-plot, fig.width = 7, fig.height = 5} autoplot(model, type = 'parameters') ``` ## Predict ```{r predict} predictions <- predict(model, threshold = 0.001) predictions ``` ```{r histogram, fig.width = 7, fig.height = 3.5} autoplot(predictions) ``` ```{r waterfall, fig.width = 7, fig.height = 4} autoplot(predictions, which = 1) ``` ## Evaluate against ground truth ```{r accuracy} acc <- il_accuracy(model, labels_col = 'ground_truth') acc ``` ```{r accuracy-plot, fig.width = 7, fig.height = 4} autoplot(acc) ``` ### Error inspection ```{r errors-fp} errors <- il_errors(model, labels_col = 'ground_truth', threshold = 0.5) errors[errors$error_type == 'false_positive', ] ``` ```{r errors-fn} errors[errors$error_type == 'false_negative', ] ``` ## Cleanup ```{r cleanup} il_cleanup(model) DBI::dbDisconnect(con, shutdown = TRUE) ``` `il_cleanup(model)` only removes tables owned by that model. If an interactive run fails before you keep the model object, call `il_cleanup_all(con)` to remove all `irelink` tables from the connection before disconnecting.