This vignette reproduces the Splink
“Deduplicate 50k synthetic” demo in irelink. The data
is based on historical people scraped from Wikidata and includes
duplicate records with realistic errors such as typos, missing values,
and swapped fields. The cluster column provides the
ground-truth entity labels used in evaluation.
This vignette requires nanoparquet to read the remote Parquet file, and it only compiles when both the package and the data URL are available.
library(irelink)
library(ggplot2)
df
#> # A data frame: 50,578 × 11
#> unique_id cluster full_name first_and_surname first_name surname dob
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Q2296770-1 Q2296770 thomas cliff… thomas chudleigh thomas chudle… 1630…
#> 2 Q2296770-2 Q2296770 thomas of ch… thomas chudleigh thomas chudle… 1630…
#> 3 Q2296770-3 Q2296770 tom 1st baro… tom chudleigh tom chudle… 1630…
#> 4 Q2296770-4 Q2296770 thomas 1st c… thomas chudleigh thomas chudle… 1630…
#> 5 Q2296770-5 Q2296770 thomas cliff… thomas chudleigh thomas chudle… 1630…
#> 6 Q2296770-6 Q2296770 thomas cliff… thomas chudleigh thomas chudle… 1630…
#> 7 Q2296770-7 Q2296770 tom baron ch… tom chudleigh tom chudle… 1630…
#> 8 Q2296770-8 Q2296770 tom clifford… tom chudleigh tom chudle… <NA>
#> 9 Q2296770-9 Q2296770 thomas cliff… thomas chudleigh thomas chudle… 1630…
#> 10 Q2296770-10 Q2296770 thomas cliff… thomas chudleigh thomas chudle… <NA>
#> # ℹ 50,568 more rows
#> # ℹ 4 more variables: birth_place <chr>, postcode_fake <chr>, gender <chr>,
#> # occupation <chr>
Use completeness and value distributions to choose blocking rules and comparisons:
il_profile(df, first_name, surname, dob, birth_place, con = con, top_n = 8)
#> # A tibble: 32 × 3
#> column value n
#> <chr> <chr> <dbl>
#> 1 first_name william 2780
#> 2 first_name john 2736
#> 3 first_name thomas 1448
#> 4 first_name george 1415
#> 5 first_name henry 1306
#> 6 first_name james 1265
#> 7 first_name sir 1262
#> 8 first_name charles 1216
#> 9 surname <NA> 4515
#> 10 surname baronet 615
#> # ℹ 22 more rows
il_suggest_blocking(df, con = con)
#> # A tibble: 10 × 6
#> rule n_distinct coverage n_pairs pct_of_cartesian score
#> <chr> <int> <dbl> <int> <dbl> <dbl>
#> 1 cluster 5156 1 303961 0.0238 1.000
#> 2 full_name 25573 0.999 87973 0.0069 0.999
#> 3 first_and_surname 20479 0.999 262393 0.0205 0.998
#> 4 first_name 4413 0.999 16372982 1.28 0.986
#> 5 surname 6195 0.911 733085 0.0573 0.910
#> 6 birth_place 2373 0.863 4923790 0.385 0.860
#> 7 postcode_fake 12363 0.774 112172 0.0088 0.774
#> 8 dob 8985 0.774 1549081 0.121 0.774
#> 9 occupation 453 0.5 12573944 0.983 0.495
#> 10 gender 7 0.778 561648436 43.9 0.436
The cumulative_pairs column shows the total number of
unique pairs produced so far:
il_count_pairs(
df,
block_on(surname, dob),
block_on(first_name, dob),
block_on(first_name, surname),
block_on(dob, birth_place),
con = con
)
#> # A tibble: 4 × 4
#> rule n_pairs cumulative_pairs pct_of_cartesian
#> <chr> <dbl> <dbl> <dbl>
#> 1 surname & dob 62893 62893 0.0049
#> 2 first_name & dob 67757 92798 0.0073
#> 3 first_name & surname 243656 298602 0.0233
#> 4 dob & birth_place 66657 314273 0.0246
Apply term-frequency adjustment to birth_place and
occupation so common values such as “London” receive less
weight than rare ones:
spec <- il_spec() |>
il_compare(first_name, cl_name()) |>
il_compare(surname, cl_name()) |>
il_compare(dob, cl_dob()) |>
il_compare(postcode_fake, cl_postcode()) |>
il_compare(birth_place, cl_exact(term_frequency = TRUE)) |>
il_compare(occupation, cl_exact(term_frequency = TRUE)) |>
il_block_on(first_name ~ il_substr(1, 3), surname ~ il_substr(1, 4)) |>
il_block_on(surname, dob) |>
il_block_on(first_name, dob) |>
il_block_on(postcode_fake, first_name) |>
il_block_on(postcode_fake, surname) |>
il_block_on(dob, birth_place) |>
il_block_on(postcode_fake ~ il_substr(1, 3), dob) |>
il_block_on(postcode_fake ~ il_substr(1, 3), first_name) |>
il_block_on(postcode_fake ~ il_substr(1, 3), surname) |>
il_block_on(
first_name ~ il_substr(1, 2),
surname ~ il_substr(1, 2),
dob ~ il_substr(1, 4)
)
spec
#> Linkage Specification
#> Comparisons (6):
#> first_name : levels
#> surname : levels
#> dob : levels
#> postcode_fake : levels
#> birth_place : exact
#> occupation : exact
#> Blocking rules (10, OR-ed):
#> 1. first_name [il_substr(1,3)], surname [il_substr(1,4)]
#> 2. surname, dob
#> 3. first_name, dob
#> 4. postcode_fake, first_name
#> 5. postcode_fake, surname
#> 6. dob, birth_place
#> 7. postcode_fake [il_substr(1,3)], dob
#> 8. postcode_fake [il_substr(1,3)], first_name
#> 9. postcode_fake [il_substr(1,3)], surname
#> 10. first_name [il_substr(1,2)], surname [il_substr(1,2)], dob [il_substr(1,4)]
model <- df |>
il_model(spec = spec, con = con) |>
il_estimate_prior(
block_on(first_name, surname, dob),
block_on(dob, postcode_fake),
recall = 0.6
) |>
il_estimate_u(max_pairs = 5e6) |>
il_estimate_em(block_on(first_name, surname)) |>
il_estimate_em(block_on(dob))
#> EM trained: dob, postcode_fake, birth_place, and occupation | skipped (blocked
#> on): first_name and surname
#> EM trained: first_name, surname, postcode_fake, birth_place, and occupation |
#> skipped (blocked on): dob
summary(model)
#> irelink Model
#> Status: Trained
#> Link type: dedupe
#> Records: 50578
#> Comparisons: 6
#> Blocking rules: 10
#>
#> Parameters:
#> prior: 0.0003629405
#> comparisons: # A tibble: 25 × 4
#> comparisons: comparison gamma_level m u
#> comparisons: <chr> <int> <dbl> <dbl>
#> comparisons: 1 first_name 0 0.177 0.963
#> comparisons: 2 first_name 1 0.123 0.0180
#> comparisons: 3 first_name 2 0.0772 0.00288
#> comparisons: 4 first_name 3 0.0628 0.00127
#> comparisons: 5 first_name 4 0.560 0.0152
#> comparisons: 6 surname 0 0.0572 0.981
#> comparisons: 7 surname 1 0.0224 0.0168
#> comparisons: 8 surname 2 0.0397 0.000430
#> comparisons: 9 surname 3 0.0951 0.000285
#> comparisons: 10 surname 4 0.785 0.00120
#> comparisons: # ℹ 15 more rows
#> u_estimation: 5e+06
#> u_estimation: FALSE
#> u_estimation: NULL
#> u_estimation: NULL
#> u_estimation: 5000000
#> u_estimation: 1
predictions <- predict(model, threshold = 0.5)
predictions
#> # A tibble: 220,801 × 13
#> unique_id_l unique_id_r gamma_first_name gamma_surname gamma_dob
#> * <chr> <chr> <int> <int> <int>
#> 1 Q8016046-2 Q8016046-20 1 2 5
#> 2 Q5763989-1 Q5763989-11 0 -1 5
#> 3 Q3182566-4 Q3182566-8 0 -1 5
#> 4 Q446382-5 Q446382-7 2 0 5
#> 5 Q5341712-5 Q5341712-8 0 -1 5
#> 6 Q722973-1 Q722973-8 0 -1 5
#> 7 Q21464647-3 Q21464647-6 2 1 5
#> 8 Q5763989-11 Q5763989-8 0 -1 5
#> 9 Q59661796-10 Q59661796-6 0 -1 5
#> 10 Q8001748-1 Q8001748-6 0 3 5
#> # ℹ 220,791 more rows
#> # ℹ 8 more variables: gamma_postcode_fake <int>, gamma_birth_place <int>,
#> # gamma_occupation <int>, match_weight <dbl>, tf_adj_birth_place <dbl>,
#> # tf_adj_occupation <dbl>, total_match_weight <dbl>, match_probability <dbl>
clusters <- il_cluster(predictions, threshold = 0.95)
clusters
#> # A tibble: 46,151 × 2
#> unique_id cluster_id
#> <chr> <chr>
#> 1 Q55218986-6 cluster_Q55218986-1
#> 2 Q6224863-7 cluster_Q6224863-1
#> 3 Q18009221-8 cluster_Q18009221-1
#> 4 Q15996859-6 cluster_Q15996859-1
#> 5 Q98931469-3 cluster_Q98931469-1
#> 6 Q4800763-4 cluster_Q4800763-1
#> 7 Q16861527-11 cluster_Q16861527-1
#> 8 Q81638842-10 cluster_Q81638842-1
#> 9 Q5725848-1 cluster_Q5725848-1
#> 10 Q56736646-3 cluster_Q56736646-1
#> # ℹ 46,141 more rows
acc <- il_accuracy(model, labels_col = 'cluster')
acc
#> # A tibble: 3,524 × 16
#> threshold tp fp fn tn fn_blocking_miss precision recall f1
#> <dbl> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
#> 1 0 203544 247197 100417 0 100417 0.452 0.670 0.539
#> 2 4.20e-9 203544 247197 100417 0 100417 0.452 0.670 0.539
#> 3 2.52e-8 203544 247157 100417 40 100417 0.452 0.670 0.539
#> 4 3.42e-8 203544 247114 100417 83 100417 0.452 0.670 0.539
#> 5 7.21e-8 203544 246732 100417 465 100417 0.452 0.670 0.540
#> 6 9.65e-8 203544 246732 100417 465 100417 0.452 0.670 0.540
#> 7 1.19e-7 203544 246683 100417 514 100417 0.452 0.670 0.540
#> 8 1.31e-7 203544 246682 100417 515 100417 0.452 0.670 0.540
#> 9 1.32e-7 203544 246682 100417 515 100417 0.452 0.670 0.540
#> 10 1.56e-7 203544 246682 100417 515 100417 0.452 0.670 0.540
#> # ℹ 3,514 more rows
#> # ℹ 7 more variables: f2 <dbl>, f0_5 <dbl>, specificity <dbl>, npv <dbl>,
#> # accuracy <dbl>, p4 <dbl>, phi <dbl>
When you use labels_col, the evaluation derives all true
duplicate pairs from the ground-truth cluster column. Some true pairs
may never be generated by the blocking rules; these are reported in
fn_blocking_miss and count as false negatives at every threshold. As a
result, the maximum recall in the accuracy, ROC, and precision-recall
plots is capped at the blocking recall (0.670 here, where all 100,417
false negatives at threshold 0 are blocking misses):
errors <- il_errors(model, labels_col = 'cluster', threshold = 0.999)
errors[errors$error_type == 'false_positive', ]
#> # A tibble: 128 × 6
#> unique_id_l unique_id_r match_weight match_probability true_label error_type
#> <chr> <chr> <dbl> <dbl> <lgl> <chr>
#> 1 Q15176618-6 Q8005070-2 22.4 1.000 FALSE false_posi…
#> 2 Q3568485-2 Q3568487-2 38.9 1.000 FALSE false_posi…
#> 3 Q15176618-8 Q8005070-4 22.8 1.000 FALSE false_posi…
#> 4 Q17627000-2 Q24845632-1 21.5 0.999 FALSE false_posi…
#> 5 Q3568485-5 Q3568487-2 39.4 1.000 FALSE false_posi…
#> 6 Q4088004-5 Q545038-3 22.6 1.000 FALSE false_posi…
#> 7 Q7528045-6 Q7528085-5 26.0 1.000 FALSE false_posi…
#> 8 Q1512-10 Q325068-2 22.6 1.000 FALSE false_posi…
#> 9 Q7528045-3 Q7528085-7 23.4 1.000 FALSE false_posi…
#> 10 Q5583974-2 Q6130041-1 23.7 1.000 FALSE false_posi…
#> # ℹ 118 more rows
Some false negatives occur because the true pair was never generated by any blocking rule:
errors <- il_errors(model, labels_col = 'cluster', threshold = 0.5)
errors[errors$error_type == 'false_negative', ]
#> # A tibble: 112,562 × 6
#> unique_id_l unique_id_r match_weight match_probability true_label error_type
#> <chr> <chr> <dbl> <dbl> <lgl> <chr>
#> 1 Q4799003-15 Q4799003-18 10.7 0.372 TRUE false_neg…
#> 2 Q5392750-16 Q5392750-4 7.09 0.0473 TRUE false_neg…
#> 3 Q5541822-14 Q5541822-7 7.71 0.0708 TRUE false_neg…
#> 4 Q4020180-3 Q4020180-4 8.97 0.154 TRUE false_neg…
#> 5 Q1721559-13 Q1721559-15 8.97 0.154 TRUE false_neg…
#> 6 Q20734050-13 Q20734050-5 9.68 0.229 TRUE false_neg…
#> 7 Q4529867-10 Q4529867-14 8.76 0.136 TRUE false_neg…
#> 8 Q16198727-14 Q16198727-8 11.3 0.485 TRUE false_neg…
#> 9 Q2734255-5 Q2734255-9 10.5 0.349 TRUE false_neg…
#> 10 Q19654778-13 Q19654778-3 5.87 0.0208 TRUE false_neg…
#> # ℹ 112,552 more rows