Simulating Eq. 10.2
| cnblogs | AdaBoost对实际数据分类的Julia实现 |
| ---- | ---- |
| Author | szcf-weiya |
| Date | 2018-01-03 |
| Updated | 2018-02-28; 2018-05-17 by @xinyu-intel |
This section simulates Eq. (10.2) from Section 10.1.
Problem
The ten features $X_1,\ldots,X_{10}$ are independent standard Gaussian random variables, and the target $Y$ is defined by

$$
Y = \begin{cases}
1 & \text{if } \sum_{j=1}^{10} X_j^2 > \chi^2_{10}(0.5),\\
-1 & \text{otherwise},
\end{cases}
$$

where $\chi^2_{10}(0.5) \approx 9.34$ is the median of a chi-squared random variable with 10 degrees of freedom. We use 2000 training cases, with approximately 1000 cases in each class, and 10,000 test observations. The weak classifier is a "stump": a classification tree with just two terminal nodes.
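The cutoff $\chi^2_{10}(0.5)$ is hard-coded below as 9.341818, the value of R's `qchisq(0.5, 10)`. As a quick sanity check it can also be computed in Julia; the snippet assumes the Distributions.jl package, which the original code does not use:

```julia
using Distributions  # assumed package, not required by the code in this section

# median of the chi-squared distribution with 10 degrees of freedom:
# observations with sum(x.^2) above this value receive label +1
quantile(Chisq(10), 0.5)  # ≈ 9.341818
```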
Julia Implementation
For details of the Julia language itself, refer to the official manual.
First we define the model struct. It needs two fields: the number of weak classifiers `n_clf`, and an `n_clf` $\times 4$ matrix that stores the `n_clf` weak classifiers. Each weak classifier, a stump with two terminal nodes, is determined by three parameters: the index of the splitting variable `idx`, the cutpoint value `val` of that variable, and the classification direction `flag` (when `flag` is 1, all observations larger than the cutpoint go to the right node of the tree; when `flag` is $-1$ they go to the left node). Together with the weight `alpha` required by the algorithm, each stump therefore needs four parameters. The code below uses 10 weak classifiers by default.
struct Adaboost
    n_clf::Int64
    clf::Matrix
end

function Adaboost(;n_clf::Int64 = 10)
    clf = zeros(n_clf, 4)
    return Adaboost(n_clf, clf)
end
Training the model:
function train!(model::Adaboost, X::Matrix, y::Vector)
    n_sample, n_feature = size(X)
    ## initialize weight
    w = ones(n_sample) / n_sample
    threshold = 0
    ## indicate the classification direction
    ## consider observation obs which is larger than cutpoint.val
    ## if flag = 1, then classify obs as 1
    ## else if flag = -1, classify obs as -1
    flag = 0
    feature_index = 0
    alpha = 0
    for i = 1:model.n_clf
        ## step 2(a): stump
        err_max = 1e10
        for feature_ind = 1:n_feature
            for threshold_ind = 1:n_sample
                flag_ = 1
                err = 0
                threshold_ = X[threshold_ind, feature_ind]
                for sample_ind = 1:n_sample
                    pred = 1
                    x = X[sample_ind, feature_ind]
                    if x < threshold_
                        pred = -1
                    end
                    err += w[sample_ind] * (y[sample_ind] != pred)
                end
                err = err / sum(w)
                if err > 0.5
                    err = 1 - err
                    flag_ = -1
                end
                if err < err_max
                    err_max = err
                    threshold = threshold_
                    flag = flag_
                    feature_index = feature_ind
                end
            end
        end
        ## step 2(c)
        #alpha = 1/2 * log((1-err_max)/(err_max))
        alpha = 1/2 * log((1.000001-err_max)/(err_max+0.000001))
        ## step 2(d)
        for j = 1:n_sample
            pred = 1
            x = X[j, feature_index]
            if flag * x < flag * threshold
                pred = -1
            end
            w[j] = w[j] * exp(-alpha * y[j] * pred)
        end
        model.clf[i, :] = [feature_index, threshold, flag, alpha]
    end
end
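A remark on steps 2(c) and 2(d): the code uses $\alpha = \frac{1}{2}\log\frac{1-\overline{\rm err}}{\overline{\rm err}}$ together with the update $w_i \leftarrow w_i\, e^{-\alpha y_i G(x_i)}$, instead of $\alpha_m = \log\frac{1-\overline{\rm err}_m}{\overline{\rm err}_m}$ and $w_i \leftarrow w_i\, e^{\alpha_m I(y_i \neq G_m(x_i))}$ as written in Algorithm 10.1. The two are equivalent, since

$$
e^{-\alpha y_i G(x_i)} = e^{\alpha\,(2I(y_i\neq G(x_i)) - 1)} = e^{-\alpha}\, e^{2\alpha\, I(y_i\neq G(x_i))},
$$

and $2\alpha = \log\frac{1-\overline{\rm err}}{\overline{\rm err}} = \alpha_m$; the constant factor $e^{-\alpha}$ scales every weight equally and cancels when the weighted error is divided by `sum(w)`. The small constants `1.000001` and `0.000001` in step 2(c) only guard against taking the logarithm of zero (or dividing by zero) when a stump classifies the weighted sample perfectly.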
Making predictions:
function predict(model::Adaboost, x::Matrix)
    n = size(x, 1)
    res = zeros(n)
    for i = 1:n
        res[i] = predict(model, x[i, :])
    end
    return res
end

function predict(model::Adaboost, x::Vector)
    s = 0
    for i = 1:model.n_clf
        pred = 1
        feature_index = trunc(Int64, model.clf[i, 1])
        threshold = model.clf[i, 2]
        flag = model.clf[i, 3]
        alpha = model.clf[i, 4]
        x_temp = x[feature_index]
        if flag * x_temp < flag * threshold
            pred = -1
        end
        s += alpha * pred
    end
    return sign(s)
end
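A minimal usage sketch, assuming the definitions above have been loaded (the toy data here is made up and is not the simulation of Eq. (10.2)):

```julia
# toy example: 200 observations, 2 features, label = sign of the first feature
X = randn(200, 2)
y = [X[i, 1] > 0 ? 1 : -1 for i in 1:200]

model = Adaboost(n_clf = 5)    # an ensemble of 5 stumps
train!(model, X, y)
yhat = predict(model, X)
println("training accuracy: ", sum(yhat .== y) / length(y))
```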
Next, we apply the model to the simulated example:
function generate_data(N)
    p = 10
    x = randn(N, p)
    x2 = x .* x
    c = 9.341818  # qchisq(0.5, 10)
    y = zeros(Int64, N)
    for i = 1:N
        tmp = sum(x2[i, :])
        if tmp > c
            y[i] = 1
        else
            y[i] = -1
        end
    end
    return x, y
end

function test_Adaboost()
    x_train, y_train = generate_data(2000)
    x_test, y_test = generate_data(10000)
    m = 1:20:400
    res = zeros(size(m, 1))
    for i = 1:size(m, 1)
        model = Adaboost(n_clf = m[i])
        train!(model, x_train, y_train)
        predictions = predict(model, x_test)
        println("The number of weak classifiers ", m[i])
        res[i] = classification_error(y_test, predictions)
        println("classification error: ", res[i])
    end
    return hcat(m, res)
end
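`test_Adaboost` calls a helper `classification_error`, which is not shown in the snippet above; a minimal version consistent with how it is used could be:

```julia
# fraction of misclassified observations
function classification_error(y::Vector, pred::Vector)
    return sum(y .!= pred) / length(y)
end
```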
Plotting the test error against the number of boosting iterations gives the figure below. As the number of iterations increases, the error rate keeps decreasing, which is consistent with Figure 10.2.
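The figure can be reproduced along the following lines; this is only a sketch that assumes the Plots.jl package, which the original code does not necessarily use:

```julia
using Plots  # assumed plotting package

res = test_Adaboost()   # column 1: number of stumps, column 2: test error
plot(res[:, 1], res[:, 2],
     xlabel = "Boosting Iterations", ylabel = "Test Error",
     marker = :circle, legend = false)
```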
The complete code can be found here.
R Implementation
## generate dataset
genData <- function(N, p = 10)
{
    x = matrix(rnorm(N*p), nrow = N)
    x2 = x^2
    y = rowSums(x2)
    y = sapply(y, function(x) ifelse(x > qchisq(0.5, 10), 1, -1))
    return(list(x = x, y = y))
}

set.seed(123)
train.data = genData(2000)
table(train.data$y)
# y
#  -1   1
# 980 1020
AdaBoost <- function(x, y, m = 10)
{
    p = ncol(x)
    N = nrow(x)
    res = matrix(nrow = m, ncol = 4)
    ## initialize weight
    w = rep(1, N)/N
    ## indicate the classification direction
    ## consider observation obs which is larger than cutpoint.val
    ## if flag = 1, then classify obs as 1
    ## else if flag = -1, classify obs as -1
    flag = 1
    cutpoint.val = 0
    cutpoint.idx = 0
    for (i in 1:m)
    {
        ## step 2(a): stump
        tol = 1e10
        for (j in 1:p)
        {
            xj = x[, j]
            for (k in 1:N)
            {
                flag.tmp = 1
                cutpoint.val.tmp = x[k, j]
                # err = 0
                # for (kk in 1:N)
                # {
                #     pred = 1
                #     xx = x[kk, j]
                #     if (xx < cutpoint.val.tmp)
                #         pred = -1
                #     err = err + w[kk] * as.numeric(y[kk] != pred)
                # }
                pred = sapply(xj, function(x) ifelse(x < cutpoint.val.tmp, -1, 1))
                ## step 2(b): weighted error rate
                err = sum(w * as.numeric(y != pred)) / sum(w)
                if (err > 0.5)
                {
                    err = 1 - err
                    flag.tmp = -1
                }
                if (err < tol)
                {
                    tol = err
                    cutpoint.val = cutpoint.val.tmp
                    cutpoint.idx = j
                    flag = flag.tmp
                }
            }
        }
        ## step 2(c)
        #alpha = 0.5 * log((1-tol)/tol)
        alpha = 0.5 * log((1+1e-6-tol)/(tol+1e-6))
        ## step 2(d)
        for (k in 1:N)
        {
            pred = 1
            xx = x[k, cutpoint.idx]
            if (flag * xx < flag * cutpoint.val)
                pred = -1
            w[k] = w[k] * exp(-alpha*y[k]*pred)
        }
        res[i, ] = c(cutpoint.idx, cutpoint.val, flag, alpha)
    }
    colnames(res) = c("idx", "val", "flag", "alpha")
    model = list(M = m, G = res)
    class(model) = "AdaBoost"
    return(model)
}
print.AdaBoost <- function(model)
{
    cat(model$M, " weak classifiers are as follows: \n\n")
    cat(sprintf("%4s%12s%8s%12s\n", colnames(model$G)[1], colnames(model$G)[2], colnames(model$G)[3], colnames(model$G)[4]))
    for (i in 1:model$M)
    {
        cat(sprintf("%4d%12f%8d%12f\n", model$G[i, 1], model$G[i, 2], model$G[i, 3], model$G[i, 4]))
    }
}
predict.AdaBoost <- function(model, x)
{
    n = nrow(x)
    res = integer(n)
    for (i in 1:n)
    {
        s = 0
        xx = x[i, ]
        for (j in 1:model$M)
        {
            pred = 1
            idx = model$G[j, 1]
            val = model$G[j, 2]
            flag = model$G[j, 3]
            alpha = model$G[j, 4]
            cutpoint = xx[idx]
            if (flag * cutpoint < flag * val)
                pred = -1
            s = s + alpha * pred
        }
        res[i] = sign(s)
    }
    return(res)
}
testAdaBoost <- function()
{
    ## train datasets
    train.data = genData(2000)
    x = train.data$x
    y = train.data$y
    ## test datasets
    test.data = genData(10000)
    test.x = test.data$x
    test.y = test.data$y
    m = seq(1, 400, by = 20)
    err = numeric(length(m))
    for (i in 1:length(m))
    {
        model = AdaBoost(x, y, m = m[i])
        res = table(test.y, predict(model, test.x))
        err[i] = (res[1, 2] + res[2, 1])/length(test.y)
        cat(sprintf("epoch = %d, err = %f\n", i, err[i]))
    }
    return(err)
}
This runs much more slowly than the Julia version, presumably because the exhaustive search over features and cutpoints is carried out in interpreted R loops.
R GBM Implementation
library(gbm)

## generate dataset
genData <- function(N, p = 10) {
    x = matrix(rnorm(N * p), nrow = N)
    x2 = x^2
    y = rowSums(x2)
    y = sapply(y, function(x) ifelse(x > qchisq(0.5, 10), 1, -1))
    return(data.frame(x, y))
}

set.seed(123)
train.data = genData(2000)
table(train.data$y)
# y
#  -1   1
# 980 1020

## train datasets
train.data = genData(2000)
train.data[train.data$y == -1, 11] <- 0
## test datasets
test.data = genData(10000)
test.data[test.data$y == -1, 11] <- 0

# Do training with the maximum number of trees:
Adaboost = TRUE
n_trees = 400
if (Adaboost) {
    print("Running Adaboost ...")
    m = gbm(y ~ ., data = train.data, distribution = "adaboost",
            n.trees = n_trees, shrinkage = 1, verbose = TRUE)
} else {
    print("Running Bernoulli Boosting ...")
    m = gbm(y ~ ., data = train.data, distribution = "bernoulli",
            n.trees = n_trees, verbose = TRUE)
}

training_error = matrix(0, nrow = n_trees, ncol = 1)
for (nti in seq(1, n_trees)) {
    Fhat = predict(m, train.data[, 1:10], n.trees = nti)
    pcc = mean(((Fhat <= 0) & (train.data[, 11] == 0)) | ((Fhat > 0) & (train.data[, 11] == 1)))
    training_error[nti] = 1 - pcc
}

test_error = matrix(0, nrow = n_trees, ncol = 1)
for (nti in seq(1, n_trees)) {
    Fhat = predict(m, test.data[, 1:10], n.trees = nti)
    pcc = mean(((Fhat <= 0) & (test.data[, 11] == 0)) | ((Fhat > 0) & (test.data[, 11] == 1)))
    test_error[nti] = 1 - pcc
}

plot(seq(1, n_trees), training_error, type = "l", main = "Boosting Probability of Error",
     col = "red", xlab = "number of boosting iterations", ylab = "classification error")
lines(seq(1, n_trees), test_error, type = "l", col = "green")
legend(275, 0.45, c("training error", "testing error"),
       col = c("red", "green"), lty = c(1, 1))