This repository hosts the code for the MLBC package, which implements the bias correction methods described in Battaglia, Christensen, Hansen & Sacher (2024). A sample application of the package can be found in the file ex1.Rmd.
## Getting Started

The package can be installed from CRAN by running

```r
install.packages("MLBC")
```

in the R console. The core functions of the package are:
## ols_bca

This procedure first computes the standard OLS estimator on a design matrix (Xhat) whose first column contains AI/ML-generated binary labels, and then applies an additive correction based on an externally computed estimate (fpr) of the false-positive rate. The method also adjusts the variance estimator with a finite-sample correction term to account for the uncertainty in estimating the bias.
```
Parameters
----------
Y : array_like, shape (n,)
    Response variable vector.
Xhat : array_like, shape (n, d)
    Design matrix whose first column contains the AI/ML-generated binary
    covariates.
fpr : float
    False-positive rate of misclassification, used to correct the OLS
    estimates.
m : int or float
    Size of the external sample used to estimate the classifier's
    false-positive rate. Can be set to 'inf' when the false-positive rate
    is known exactly.

Returns
-------
b : ndarray, shape (d,)
    Bias-corrected regression coefficient estimates.
V : ndarray, shape (d, d)
    Adjusted variance-covariance matrix for the bias-corrected estimator.
```
## one_step
This method jointly estimates the upstream (measurement) and downstream (regression) models using only the unlabeled likelihood. Leveraging JAX for automatic differentiation and optimization, it minimizes the negative log-likelihood to obtain the regression coefficients. The variance is then approximated via the inverse Hessian at the optimum.
```
Parameters
----------
Y : array_like, shape (n,)
    Response variable vector.
Xhat : array_like, shape (n, d)
    Design matrix constructed from AI/ML-generated regressors.
homoskedastic : bool, optional (default: False)
    If True, assumes a common error variance; otherwise, separate error
    variances are estimated.
distribution : callable, optional
    Distribution of the error terms; defaults to Normal(0, 1). A custom
    distribution can be passed as any JAX-compatible PDF function that
    takes inputs (x, loc, scale).

Returns
-------
b : ndarray, shape (d,)
    Estimated regression coefficients extracted from the optimized
    parameter vector.
V : ndarray, shape (d, d)
    Estimated variance-covariance matrix for the regression coefficients,
    computed as the inverse of the Hessian of the objective function.
```
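Since `one_step` needs no external estimate of the false-positive rate, a call reduces to the following (continuing with `Y` and `W` from the sketch above; the `$coef`/`$cov` fields are the ones used in the example script below):

```r
os <- one_step(Y, W)   # defaults: heteroskedastic, Normal(0, 1) errors
os$coef   # estimated coefficients
os$cov    # variance-covariance matrix from the inverse Hessian
```

## Example

The walkthrough below (see ex1.Rmd) runs a Monte Carlo study comparing naive OLS on the generated labels, OLS on the true labels, the additive (`ols_bca`) and multiplicative (`ols_bcm`) bias corrections under three Beta priors for the false-positive rate, and the one-step estimator.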
```r
library(MLBC)

nsim  <- 1000    # number of Monte Carlo replications
n     <- 16000   # unlabeled sample size
m     <- 1000    # labeled (validation) sample size
p     <- 0.05    # P(X = 1)
kappa <- 1
fpr   <- kappa / sqrt(n)   # false-positive rate

b0 <- 10    # intercept
b1 <- 1     # slope on X
a0 <- 0.3   # error s.d. when X = 0
a1 <- 0.5   # error s.d. when X = 1

# Beta(alpha, beta) prior parameters for the false-positive rate
alpha <- c(0.0, 0.5, 0.5)
beta  <- c(0.0, 2.0, 4.0)
```
```r
# pre-allocated storage: nsim replications x 9 methods x 2 coefficients
B <- array(0, dim = c(nsim, 9, 2))   # point estimates
S <- array(0, dim = c(nsim, 9, 2))   # standard errors

# store estimates and standard errors for method `method_idx` in replication i
update_results <- function(b, V, i, method_idx) {
  for (j in 1:2) {
    B[i, method_idx, j] <<- b[j]
    S[i, method_idx, j] <<- sqrt(max(V[j, j], 0))
  }
}
```
```r
generate_data <- function(n, m, p, fpr, b0, b1, a0, a1) {
  N    <- n + m
  X    <- numeric(N)   # true binary covariate
  Xhat <- numeric(N)   # AI/ML-generated label
  u    <- runif(N)

  for (j in seq_len(N)) {
    if (u[j] <= fpr) X[j] <- 1                # false negative: X = 1, Xhat = 0
    else if (u[j] <= 2*fpr) Xhat[j] <- 1      # false positive: X = 0, Xhat = 1
    else if (u[j] <= p + fpr) {               # true positive
      X[j]    <- 1
      Xhat[j] <- 1
    }
  }

  eps <- rnorm(N)   # N(0,1)
  # heteroskedastic noise: s.d. a1 when X = 1, a0 when X = 0
  Y <- b0 + b1*X + (a1*X + a0*(1 - X)) * eps

  # split into train (unlabeled, first n) and test (labeled, last m)
  train_Y   <- Y[1:n]
  train_X   <- cbind(Xhat[1:n], rep(1, n))
  test_Y    <- Y[(n+1):N]
  test_Xhat <- cbind(Xhat[(n+1):N], rep(1, m))
  test_X    <- cbind(X[(n+1):N], rep(1, m))

  list(
    train_Y   = train_Y,
    train_X   = train_X,
    test_Y    = test_Y,
    test_Xhat = test_Xhat,
    test_X    = test_X
  )
}
```
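Since the corrections hinge on the joint false-positive mass P(Xhat = 1, X = 0), it is worth a quick sanity check that the simulator delivers it. This check is illustrative and not part of ex1.Rmd:

```r
# the empirical false-positive mass in the labeled split should be close
# to fpr = 1/sqrt(16000), roughly 0.0079
set.seed(123)
chk <- generate_data(n, m, p, fpr, b0, b1, a0, a1)
mean(chk$test_Xhat[, 1] * (1 - chk$test_X[, 1]))
```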
```r
for (i in seq_len(nsim)) {
  dat <- generate_data(n, m, p, fpr, b0, b1, a0, a1)
  tY <- dat$train_Y; tX    <- dat$train_X
  eY <- dat$test_Y;  eXhat <- dat$test_Xhat; eX <- dat$test_X

  # 1) OLS on unlabeled data (generated labels Xhat)
  ols_res <- ols(tY, tX)
  update_results(ols_res$coef, ols_res$vcov, i, 1)

  # 2) OLS on labeled data (true X)
  ols_res <- ols(eY, eX)
  update_results(ols_res$coef, ols_res$vcov, i, 2)

  # 3-8) additive & multiplicative bias corrections under three
  #      Beta priors for the false-positive rate
  fpr_hat <- mean(eXhat[, 1] * (1 - eX[, 1]))
  for (j in 1:3) {
    fpr_bayes <- (fpr_hat * m + alpha[j]) / (m + alpha[j] + beta[j])

    bca_res <- ols_bca(tY, tX, fpr_bayes, m)
    update_results(bca_res$coef, bca_res$vcov, i, 2 + j)

    bcm_res <- ols_bcm(tY, tX, fpr_bayes, m)
    update_results(bcm_res$coef, bcm_res$vcov, i, 5 + j)
  }

  # 9) one-step estimator
  one_res <- one_step(tY, tX)
  update_results(one_res$coef, one_res$cov, i, 9)

  if (i %% 100 == 0) {
    message("Completed ", i, " of ", nsim, " sims")
  }
}
```
```
## Completed 100 of 1000 sims
## Completed 200 of 1000 sims
## Completed 300 of 1000 sims
## Completed 400 of 1000 sims
## Completed 500 of 1000 sims
## Completed 600 of 1000 sims
## Completed 700 of 1000 sims
## Completed 800 of 1000 sims
## Completed 900 of 1000 sims
## Completed 1000 of 1000 sims
```
```r
# empirical coverage of nominal 95% intervals over a grid of candidate values
coverage <- function(bgrid, b, se) {
  n_grid <- length(bgrid)
  cvg <- numeric(n_grid)
  for (i in seq_along(bgrid)) {
    val <- bgrid[i]
    cvg[i] <- mean(abs(b - val) <= 1.96 * se)
  }
  cvg
}
```
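The script below computes coverage at the true slope directly, but `coverage` can also trace coverage across a grid of candidate values. For instance (illustrative, not in the original script), for the naive OLS slope:

```r
bgrid <- seq(0.8, 1.2, by = 0.05)
coverage(bgrid, b = B[, 1, 1], se = S[, 1, 1])
```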
```r
true_beta1 <- 1.0

methods <- c(
  "OLS ĥ"  = 1,
  "OLS θ̂" = 2,
  "BCA-0"  = 3,
  "BCA-1"  = 4,
  "BCA-2"  = 5,
  "BCM-0"  = 6,
  "BCM-1"  = 7,
  "BCM-2"  = 8,
  "OSU"    = 9
)

cov_dict <- sapply(methods, function(col) {
  slopes <- B[, col, 1]
  ses    <- S[, col, 1]
  mean(abs(slopes - true_beta1) <= 1.96 * ses)
})
cov_series <- setNames(cov_dict, names(methods))
print(cov_series)
```
```
## OLS ĥ OLS θ̂ BCA-0 BCA-1 BCA-2 BCM-0 BCM-1 BCM-2 OSU
## 0.000 0.952 0.841 0.890 0.888 0.887 0.912 0.911 0.951
```
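Naive OLS on the generated labels (OLS ĥ) never covers the true slope, while the bias corrections pull coverage back toward the nominal 95% level; OLS on the true labels (OLS θ̂) and the one-step estimator (OSU) are essentially on target.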
```r
method_names <- names(methods)
coef_names   <- c("Beta1", "Beta0")

nmethods <- dim(B)[2]
df <- data.frame(Method = method_names, stringsAsFactors = FALSE)

df$Est_Beta1  <- NA_real_
df$SE_Beta1   <- NA_real_
df$CI95_Beta1 <- NA_character_
df$Est_Beta0  <- NA_real_
df$SE_Beta0   <- NA_real_
df$CI95_Beta0 <- NA_character_

for (i in seq_len(nmethods)) {
  est1 <- B[, i, 1]; se1 <- S[, i, 1]
  est0 <- B[, i, 2]; se0 <- S[, i, 2]

  # 2.5% and 97.5% quantiles of the simulated estimates
  ci1 <- quantile(est1, probs = c(0.025, 0.975))
  ci0 <- quantile(est0, probs = c(0.025, 0.975))

  df$Est_Beta1[i]  <- mean(est1)
  df$SE_Beta1[i]   <- mean(se1)
  df$CI95_Beta1[i] <- sprintf("[%0.3f, %0.3f]", ci1[1], ci1[2])

  df$Est_Beta0[i]  <- mean(est0)
  df$SE_Beta0[i]   <- mean(se0)
  df$CI95_Beta0[i] <- sprintf("[%0.3f, %0.3f]", ci0[1], ci0[2])
}
print(df)
```
```
## Method Est_Beta1 SE_Beta1 CI95_Beta1 Est_Beta0 SE_Beta0
## 1 OLS ĥ 0.8345230 0.02126622 [0.793, 0.876] 10.008229 0.002559755
## 2 OLS θ̂ 0.9979584 0.07110668 [0.862, 1.139] 9.999644 0.009712562
## 3 BCA-0 0.9733887 0.05192212 [0.879, 1.097] 10.001295 0.003527102
## 4 BCA-1 0.9818128 0.05328685 [0.888, 1.105] 10.000874 0.003578399
## 5 BCA-2 0.9815196 0.05324235 [0.888, 1.105] 10.000889 0.003576679
## 6 BCM-0 1.0064253 0.06465110 [0.886, 1.197] 9.999649 0.003963054
## 7 BCM-1 1.0188792 0.06713945 [0.896, 1.213] 9.999027 0.004060987
## 8 BCM-2 1.0184172 0.06705113 [0.896, 1.212] 9.999050 0.004057390
## 9 OSU 0.9985015 0.03102350 [0.934, 1.055] 9.999928 0.002500163
## CI95_Beta0
## 1 [10.003, 10.013]
## 2 [9.981, 10.018]
## 3 [9.994, 10.007]
## 4 [9.994, 10.007]
## 5 [9.994, 10.007]
## 6 [9.990, 10.007]
## 7 [9.989, 10.007]
## 8 [9.989, 10.007]
## 9 [9.995, 10.005]
```
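Averaged over the simulations, every corrected estimator is roughly centered at the true values (b1 = 1, b0 = 10), while naive OLS on the generated labels is attenuated (slope near 0.83) with an interval that misses the truth entirely.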