ageutils provides a collection of functions for working with age intervals whose underlying implementations have been optimised for performance.
breaks_to_interval()
breaks_to_interval
provides a categorisation based on specified breaks which
represent left-hand interval limits. The resultant groupings span from the
minimum break through to a specified max_upper
and will always be closed on
the left and open on the right. As an example, if breaks = c(0, 1, 10, 30)
the
interval categories would be [0, 1), [1, 10), [10, 30) and [30, Inf). Ages greater than or equal to
max_upper
will be returned as NA.
The returned value is a data frame with 3 columns: a factor with a character representation of the interval, and two columns representing the numeric values of the corresponding lower (closed) and upper (open) bounds.
library(ageutils)
breaks_to_interval(breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#> interval lower_bound upper_bound
#> 1 [0, 1) 0 1
#> 2 [1, 5) 1 5
#> 3 [5, 15) 5 15
#> 4 [15, 25) 15 25
#> 5 [25, 45) 25 45
#> 6 [45, 65) 45 65
#> 7 [65, Inf) 65 Inf
breaks_to_interval(breaks = c(1L, 5L, 15L), max_upper = 25L)
#> interval lower_bound upper_bound
#> 1 [1, 5) 1 5
#> 2 [5, 15) 5 15
#> 3 [15, 25) 15 25
cut_ages()
cut_ages()
provides categorisation of ages based on specified breaks which
represent the left-hand interval limits. Categorisation is based on the breaks
and follows the approach of breaks_to_interval
.
cut_ages(ages = 0:9, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#> interval lower_bound upper_bound
#> 1 [0, 1) 0 1
#> 2 [1, 5) 1 5
#> 3 [1, 5) 1 5
#> 4 [1, 5) 1 5
#> 5 [1, 5) 1 5
#> 6 [5, 15) 5 15
#> 7 [5, 15) 5 15
#> 8 [5, 15) 5 15
#> 9 [5, 15) 5 15
#> 10 [5, 15) 5 15
cut_ages(1:10, breaks = c(0L, 4L), max_upper = 9L)
#> interval lower_bound upper_bound
#> 1 [0, 4) 0 4
#> 2 [0, 4) 0 4
#> 3 [0, 4) 0 4
#> 4 [4, 9) 4 9
#> 5 [4, 9) 4 9
#> 6 [4, 9) 4 9
#> 7 [4, 9) 4 9
#> 8 [4, 9) 4 9
#> 9 <NA> NA NA
#> 10 <NA> NA NA
x <- cut_ages(1:100, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
str(x)
#> 'data.frame': 100 obs. of 3 variables:
#> $ interval : Ord.factor w/ 7 levels "[0, 1)"<"[1, 5)"<..: 2 2 2 2 3 3 3 3 3 3 ...
#> $ lower_bound: num 1 1 1 1 5 5 5 5 5 5 ...
#> $ upper_bound: num 5 5 5 5 15 15 15 15 15 15 ...
head(x$interval)
#> [1] [1, 5) [1, 5) [1, 5) [1, 5) [5, 15) [5, 15)
#> 7 Levels: [0, 1) < [1, 5) < [5, 15) < [15, 25) < [25, 45) < ... < [65, Inf)
split_interval_counts()
split_interval_counts()
splits counts within an age interval into counts for
individual years based on a given weighting. Age intervals are specified by
their lower (closed) and upper (open) bounds, i.e. intervals of the form
[lower, upper).
# by default counts are split equally across ages within intervals
split_interval_counts(
lower_bounds = c(0L, 5L, 10L),
upper_bounds = c(5L, 10L, 20L),
counts = c(5L, 10L, 30L)
)
#> age count
#> 1 0 1
#> 2 1 1
#> 3 2 1
#> 4 3 1
#> 5 4 1
#> 6 5 2
#> 7 6 2
#> 8 7 2
#> 9 8 2
#> 10 9 2
#> 11 10 3
#> 12 11 3
#> 13 12 3
#> 14 13 3
#> 15 14 3
#> 16 15 3
#> 17 16 3
#> 18 17 3
#> 19 18 3
#> 20 19 3
# Population weightings to apply for individual years can be specified by
# the weights argument. If these are specified, they must be of length
# `max_upper` and represent weights for the ages 0:(max_upper - 1).
max_upper <- 20L
weights <- integer(max_upper)
# The logical index c(TRUE, FALSE) is recycled along the vector, so every
# other element (ages 0, 2, 4, ...) gets weight 1 and the rest stay at 0.
weights[c(TRUE, FALSE)] <- 1L
split_interval_counts(
  lower_bounds = c(0L, 5L, 10L),
  upper_bounds = c(5L, 10L, 20L),
  counts = c(5L, 10L, 30L),
  max_upper = max_upper,
  # pass by name (`=`); `<-` here would be an assignment evaluated in the
  # calling frame whose value is only matched positionally
  weights = weights
)
#> age count
#> 1 0 1.666667
#> 2 1 0.000000
#> 3 2 1.666667
#> 4 3 0.000000
#> 5 4 1.666667
#> 6 5 0.000000
#> 7 6 5.000000
#> 8 7 0.000000
#> 9 8 5.000000
#> 10 9 0.000000
#> 11 10 6.000000
#> 12 11 0.000000
#> 13 12 6.000000
#> 14 13 0.000000
#> 15 14 6.000000
#> 16 15 0.000000
#> 17 16 6.000000
#> 18 17 0.000000
#> 19 18 6.000000
#> 20 19 0.000000
aggregate_age_counts()
aggregate_age_counts()
provides aggregation of counts across ages (in years).
It is similar to a cut()
and tapply()
pattern but optimised for speed over
flexibility. Groupings are the same as in cut_ages()
and counts will
be provided across all natural numbers as well as for missing values.
# default ages generated as 0:(length(counts) - 1L) if only counts provided.
aggregate_age_counts(counts = 1:65, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#> interval lower_bound upper_bound count
#> 1 [0, 1) 0 1 1
#> 2 [1, 5) 1 5 14
#> 3 [5, 15) 5 15 105
#> 4 [15, 25) 15 25 205
#> 5 [25, 45) 25 45 710
#> 6 [45, 65) 45 65 1110
#> 7 [65, Inf) 65 Inf 0
# NA ages are also handled with their own grouping
ages <- 1:65
ages[1:44] <- NA
aggregate_age_counts(
counts = 1:65,
ages = ages,
breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L)
)
#> interval lower_bound upper_bound count
#> 1 [0, 1) 0 1 0
#> 2 [1, 5) 1 5 0
#> 3 [5, 15) 5 15 0
#> 4 [15, 25) 15 25 0
#> 5 [25, 45) 25 45 0
#> 6 [45, 65) 45 65 1090
#> 7 [65, Inf) 65 Inf 65
#> 8 <NA> NA NA 990
reaggregate_interval_counts()
reaggregate_interval_counts()
is equivalent to, but more efficient than a call
to split_interval_counts()
followed by aggregate_age_counts()
.
The example below shows how it can be used to redistribute counts across a desired set of age intervals. We use data included in the package that has been obtained from the 2021 census and modify this based on our desired interval limits.
# census data
data(pop_dat)
pop_dat
#> area_code area_name age_category value
#> 1 K04000001 England and Wales [0, 5) 3232100
#> 2 K04000001 England and Wales [5, 10) 3524600
#> 3 K04000001 England and Wales [10, 15) 3595900
#> 4 K04000001 England and Wales [15, 20) 3394700
#> 5 K04000001 England and Wales [20, 25) 3602100
#> 6 K04000001 England and Wales [25, 30) 3901800
#> 7 K04000001 England and Wales [30, 35) 4148800
#> 8 K04000001 England and Wales [35, 40) 3981600
#> 9 K04000001 England and Wales [40, 45) 3755700
#> 10 K04000001 England and Wales [45, 50) 3788700
#> 11 K04000001 England and Wales [50, 55) 4123400
#> 12 K04000001 England and Wales [55, 60) 4029000
#> 13 K04000001 England and Wales [60, 65) 3455700
#> 14 K04000001 England and Wales [65, 70) 2945100
#> 15 K04000001 England and Wales [70, 75) 2978000
#> 16 K04000001 England and Wales [75, 80) 2170300
#> 17 K04000001 England and Wales [80, 85) 1517000
#> 18 K04000001 England and Wales [85, 90) 925100
#> 19 K04000001 England and Wales [90, Inf) 527900
# every row refers to the same area, so drop the area columns for the moment
dat <- subset(pop_dat, select = c(age_category, value))
# extract the lower and upper bounds from the "[lower, upper)" labels:
# the first pattern captures the digits after "[", the second captures the
# text between ", " and the closing ")" (which is "Inf" for the last category)
dat <- transform(
dat,
lower_bound = as.numeric(sub("\\[([0-9]+), .+)", "\\1", age_category)),
upper_bound = as.numeric(sub(".+, (.+))", "\\1", age_category))
)
# show the first 10 rows with the newly added bound columns
head(dat, n=10)
#> age_category value lower_bound upper_bound
#> 1 [0, 5) 3232100 0 5
#> 2 [5, 10) 3524600 5 10
#> 3 [10, 15) 3595900 10 15
#> 4 [15, 20) 3394700 15 20
#> 5 [20, 25) 3602100 20 25
#> 6 [25, 30) 3901800 25 30
#> 7 [30, 35) 4148800 30 35
#> 8 [35, 40) 3981600 35 40
#> 9 [40, 45) 3755700 40 45
#> 10 [45, 50) 3788700 45 50
# recategorise the counts into the desired age intervals given by `breaks`;
# no explicit weighting is supplied (weights = NULL).
# The open-ended [90, Inf) category has an upper bound greater than
# `max_upper`, which is why this call emits the warning shown below.
with(
dat,
reaggregate_interval_counts(
lower_bounds = lower_bound,
upper_bounds = upper_bound,
counts = value,
breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L),
max_upper = 100L,
weights = NULL
)
)
#> Warning in reaggregate_interval_counts(lower_bounds = lower_bound, upper_bounds
#> = upper_bound, : `upper_bounds` greater than `max_upper` (100) have been
#> replaced prior to splitting.
#> interval lower_bound upper_bound count
#> 1 [0, 1) 0 1 646420
#> 2 [1, 5) 1 5 2585680
#> 3 [5, 15) 5 15 7120500
#> 4 [15, 25) 15 25 6996800
#> 5 [25, 45) 25 45 15787900
#> 6 [45, 65) 45 65 15396800
#> 7 [65, Inf) 65 Inf 11063400