Differential privacy is a rigorous mathematical framework for protecting individual privacy while sharing aggregate information about a dataset. It provides formal guarantees that the presence or absence of any single individual in a dataset has minimal impact on the results of statistical queries.
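Privacy Parameters:
- Epsilon (ε): the privacy budget. Smaller values give stronger privacy but require more noise.
- Delta (δ): the probability with which the pure ε guarantee may fail. δ = 0 corresponds to pure ε-differential privacy.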
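Privacy Mechanisms:
- Laplace: provides pure ε-differential privacy (δ = 0).
- Gaussian: provides (ε, δ)-differential privacy.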
The most straightforward use case is adding differentially private noise to columns in a data frame:
# Create sample data
employee_data <- data.frame(
name = c("Alice", "Bob", "Charlie", "Diana", "Eve"),
age = c(28, 35, 42, 31, 38),
salary = c(65000, 75000, 85000, 70000, 80000)
)
# View original data
head(employee_data)
#> name age salary
#> 1 Alice 28 65000
#> 2 Bob 35 75000
#> 3 Charlie 42 85000
#> 4 Diana 31 70000
#> 5 Eve 38 80000
# Add differential privacy noise
private_data <- employee_data %>%
dp_add_noise(
columns = c("age", "salary"),
epsilon = 0.5,
lower = c(age = 22, salary = 50000),
upper = c(age = 65, salary = 150000)
)
# View privatized data
head(private_data)
#> name age salary
#> 1 Alice -19.56794 -414130.96
#> 2 Bob 108.91375 86570.53
#> 3 Charlie 24.71835 392272.89
#> 4 Diana 155.92213 91710.91
#> 5 Eve 221.01506 61846.38
Notice how the numeric columns now have noise added while the name column is preserved.
Count the number of records with privacy guarantees:
# Create sample data
city_data <- data.frame(
city = rep(c("New York", "Los Angeles", "Chicago"), c(150, 120, 80)),
category = sample(c("A", "B", "C"), 350, replace = TRUE)
)
# Overall count
overall_count <- city_data %>%
dp_count(epsilon = 0.1)
print(overall_count)
#> count
#> 1 343
# Grouped count by city
city_counts <- city_data %>%
dp_count(epsilon = 0.1, group_by = "city")
print(city_counts)
#> city count
#> 1 Chicago 95
#> 2 Los Angeles 130
#> 3 New York 157
# Count by multiple groups
city_category_counts <- city_data %>%
dp_count(epsilon = 0.1, group_by = c("city", "category"))
head(city_category_counts)
#> city category count
#> 1 Chicago A 15
#> 2 Los Angeles A 43
#> 3 New York A 74
#> 4 Chicago B 26
#> 5 Los Angeles B 35
#> 6 New York B 42
Compute private averages:
# Create sample data
income_data <- data.frame(
region = rep(c("North", "South", "East", "West"), each = 100),
income = c(
rnorm(100, mean = 60000, sd = 15000),
rnorm(100, mean = 55000, sd = 12000),
rnorm(100, mean = 65000, sd = 18000),
rnorm(100, mean = 58000, sd = 14000)
)
)
# Overall mean income
avg_income <- income_data %>%
dp_mean(
"income",
epsilon = 0.2,
lower = 20000,
upper = 150000
)
print(avg_income)
#> income_mean
#> 1 61648.21
# Mean by region
regional_avg <- income_data %>%
dp_mean(
"income",
epsilon = 0.2,
lower = 20000,
upper = 150000,
group_by = "region"
)
print(regional_avg)
#> region income_mean
#> 1 East 51274.24
#> 2 North 78910.84
#> 3 South 33916.84
#> 4 West 61293.99
Compute private totals:
# Create sales data
sales_data <- data.frame(
store = rep(c("Store A", "Store B", "Store C"), each = 50),
sales = c(
rpois(50, lambda = 1000),
rpois(50, lambda = 1200),
rpois(50, lambda = 900)
)
)
# Total sales by store
store_totals <- sales_data %>%
dp_sum(
"sales",
epsilon = 0.3,
lower = 0,
upper = 5000,
group_by = "store"
)
print(store_totals)
#> store sales_sum
#> 1 Store A 74639.31
#> 2 Store B 28654.55
#> 3 Store C 58905.43
When performing multiple queries on the same dataset, you need to track your total privacy expenditure using a privacy budget:
# Create a privacy budget
budget <- new_privacy_budget(
epsilon_total = 1.0,
delta_total = 1e-5
)
print(budget)
#> Privacy Budget
#> ==============
#> Total: epsilon = 1.0000, delta = 1.00e-05
#> Spent: epsilon = 0.0000, delta = 0.00e+00
#> Remaining: epsilon = 1.0000, delta = 1.00e-05
#> Composition: basic
#> Operations executed: 0
# Perform first query
result1 <- city_data %>%
dp_count(epsilon = 0.3, .budget = budget)
print(budget)
#> Privacy Budget
#> ==============
#> Total: epsilon = 1.0000, delta = 1.00e-05
#> Spent: epsilon = 0.0000, delta = 0.00e+00
#> Remaining: epsilon = 1.0000, delta = 1.00e-05
#> Composition: basic
#> Operations executed: 0
# Perform second query
result2 <- city_data %>%
dp_count(epsilon = 0.4, group_by = "city", .budget = budget)
print(budget)
#> Privacy Budget
#> ==============
#> Total: epsilon = 1.0000, delta = 1.00e-05
#> Spent: epsilon = 0.0000, delta = 0.00e+00
#> Remaining: epsilon = 1.0000, delta = 1.00e-05
#> Composition: basic
#> Operations executed: 0
# Check if we have enough budget for another query
can_query <- check_privacy_budget(budget, epsilon_required = 0.5)
print(paste("Can perform query with epsilon=0.5?", can_query))
#> [1] "Can perform query with epsilon=0.5? TRUE"
# We only have 0.3 epsilon remaining
can_query <- check_privacy_budget(budget, epsilon_required = 0.2)
print(paste("Can perform query with epsilon=0.2?", can_query))
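#> [1] "Can perform query with epsilon=0.2? TRUE"
Note: in the output recorded above, the printed budget still reports zero spent epsilon after both queries, which is why both checks return TRUE. With the two queries accounted for (0.3 + 0.4 = 0.7 spent out of 1.0), only 0.3 epsilon would remain, so the epsilon = 0.5 check would be expected to fail and the epsilon = 0.2 check to succeed.
The tidydp package uses basic composition by default, where the total privacy cost is the sum of individual query costs: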
\[\epsilon_{total} = \sum_{i=1}^{k} \epsilon_i\]
This is a conservative approach that ensures strong privacy guarantees.
| Epsilon Value | Privacy Level | Use Case |
|---|---|---|
| 0.01 - 0.1 | Very Strong | Highly sensitive medical or financial data |
| 0.1 - 1.0 | Strong | Personal information, general sensitive data |
| 1.0 - 5.0 | Moderate | Less sensitive aggregate statistics |
| 5.0+ | Weak | Public or minimally sensitive data |
Providing accurate bounds is crucial for utility:
# Example: Impact of bounds on utility
test_data <- data.frame(age = c(25, 30, 35, 40, 45))
# Tight bounds (accurate)
tight_bounds <- test_data %>%
dp_add_noise(
columns = "age",
epsilon = 0.5,
lower = c(age = 20),
upper = c(age = 50)
)
# Loose bounds (less accurate)
loose_bounds <- test_data %>%
dp_add_noise(
columns = "age",
epsilon = 0.5,
lower = c(age = 0),
upper = c(age = 100)
)
# Compare results
data.frame(
Original = test_data$age,
Tight_Bounds = round(tight_bounds$age, 1),
Loose_Bounds = round(loose_bounds$age, 1)
)
#> Original Tight_Bounds Loose_Bounds
#> 1 25 75.7 -86.4
#> 2 30 16.6 80.6
#> 3 35 157.7 60.0
#> 4 40 25.2 -465.3
#> 5 45 35.1 179.9
Tighter bounds lead to better utility (less noise) while maintaining the same privacy guarantee.
Use Laplace (default): - When you need pure ε-differential privacy (δ = 0) - For counting queries - When δ > 0 is not acceptable
Use Gaussian: - When (ε, δ)-differential privacy is acceptable - Often provides better utility for the same privacy level - When working with continuous data and aggregates
# Compare mechanisms
test_values <- data.frame(value = c(100, 200, 300, 400, 500))
# Laplace mechanism
laplace_result <- test_values %>%
dp_add_noise(
columns = "value",
epsilon = 0.5,
lower = c(value = 0),
upper = c(value = 1000),
mechanism = "laplace"
)
# Gaussian mechanism
gaussian_result <- test_values %>%
dp_add_noise(
columns = "value",
epsilon = 0.5,
delta = 1e-5,
lower = c(value = 0),
upper = c(value = 1000),
mechanism = "gaussian"
)
data.frame(
Original = test_values$value,
Laplace = round(laplace_result$value, 1),
Gaussian = round(gaussian_result$value, 1)
)
#> Original Laplace Gaussian
#> 1 100 -1668.7 -5407.2
#> 2 200 950.8 1149.2
#> 3 300 391.7 -3991.6
#> 4 400 1020.9 -17116.3
#> 5 500 1036.1 2874.8
Here’s a complete example analyzing employee data while maintaining differential privacy:
# Create employee dataset
employees <- data.frame(
department = rep(c("Engineering", "Sales", "Marketing", "HR"), each = 25),
salary = c(
rnorm(25, 85000, 15000), # Engineering
rnorm(25, 70000, 12000), # Sales
rnorm(25, 65000, 10000), # Marketing
rnorm(25, 60000, 8000) # HR
),
years_experience = c(
rpois(25, 5),
rpois(25, 4),
rpois(25, 3),
rpois(25, 4)
)
)
# Ensure realistic bounds
employees$salary <- pmax(40000, pmin(150000, employees$salary))
employees$years_experience <- pmax(0, pmin(20, employees$years_experience))
# Initialize privacy budget
analysis_budget <- new_privacy_budget(epsilon_total = 2.0)
# Query 1: Count by department (epsilon = 0.5)
dept_counts <- employees %>%
dp_count(
epsilon = 0.5,
group_by = "department",
.budget = analysis_budget
)
cat("\nEmployee counts by department:\n")
#>
#> Employee counts by department:
print(dept_counts)
#> department count
#> 1 Engineering 24
#> 2 HR 25
#> 3 Marketing 23
#> 4 Sales 28
# Query 2: Average salary by department (epsilon = 0.8)
dept_salaries <- employees %>%
dp_mean(
"salary",
epsilon = 0.8,
lower = 40000,
upper = 150000,
group_by = "department",
.budget = analysis_budget
)
cat("\nAverage salaries by department:\n")
#>
#> Average salaries by department:
print(dept_salaries)
#> department salary_mean
#> 1 Engineering 81784.28
#> 2 HR 50731.66
#> 3 Marketing 74232.57
#> 4 Sales 67084.91
# Query 3: Average experience (epsilon = 0.4)
avg_experience <- employees %>%
dp_mean(
"years_experience",
epsilon = 0.4,
lower = 0,
upper = 20,
.budget = analysis_budget
)
cat("\nAverage years of experience:\n")
#>
#> Average years of experience:
print(avg_experience)
#> years_experience_mean
#> 1 4.210764
# Check remaining budget
cat("\nFinal budget status:\n")
#>
#> Final budget status:
print(analysis_budget)
#> Privacy Budget
#> ==============
#> Total: epsilon = 2.0000, delta = 1.00e-05
#> Spent: epsilon = 0.0000, delta = 0.00e+00
#> Remaining: epsilon = 2.0000, delta = 1.00e-05
#> Composition: basic
#> Operations executed: 0
- Use new_privacy_budget() for multiple queries
Don’t run the same query multiple times without accounting for cumulative privacy loss:
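Not providing bounds forces the algorithm to use data-dependent bounds, which can leak information. Always pass explicit lower and upper values chosen from domain knowledge rather than computed from the data.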
Using epsilon > 10 provides minimal privacy protection:
# Very weak privacy
weak_privacy <- test_values %>%
dp_add_noise(
columns = "value",
epsilon = 50, # Too large!
lower = c(value = 0),
upper = c(value = 1000)
)
# The noise is minimal
data.frame(
Original = test_values$value,
With_Noise = round(weak_privacy$value, 1),
Difference = round(abs(test_values$value - weak_privacy$value), 1)
)
#> Original With_Noise Difference
#> 1 100 95.6 4.4
#> 2 200 197.8 2.2
#> 3 300 277.2 22.8
#> 4 400 375.2 24.8
#> 5 500 475.8 24.2
If you encounter issues or have questions:
- Check the package documentation: ?tidydp
- View function help: ?dp_add_noise, ?dp_count, etc.
- Run examples: example(dp_add_noise)