/*
MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
$Id: lbfgs.c 1528 2006-08-07 02:39:50Z taku $;
lbfgs.c was ported from the FORTRAN code of lbfgs.m to C
using f2c converter
http://www.ece.northwestern.edu/~nocedal/lbfgs.html
Software for Large-scale Unconstrained Optimization
L-BFGS is a limited-memory quasi-Newton code for unconstrained optimization.
The code has been developed at the Optimization Technology Center,
a joint venture of Argonne National Laboratory and Northwestern University.
Authors
Jorge Nocedal
References
- J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage(1980),
Mathematics of Computation 35, pp. 773-782.
- D.C. Liu and J. Nocedal. On the Limited Memory Method for
Large Scale Optimization(1989),
Mathematical Programming B, 45, 3, pp. 503-528.
*/
#include "lbfgs.h"
#include "common.h"
#include <cmath>
#include <iostream>
#include <numeric>
#define min(a, b) ((a) <= (b) ? (a) : (b))
#define max(a, b) ((a) >= (b) ? (a) : (b))
namespace {
static const double ftol = 1e-4;
static const double xtol = 1e-16;
static const double eps = 1e-7;
static const double lb3_1_gtol = 0.9;
static const double lb3_1_stpmin = 1e-20;
static const double lb3_1_stpmax = 1e20;
static const int lb3_1_mp = 6;
static const int lb3_1_lp = 6;
inline double pi(double x, double y) {
return CRFPP::sigma(x) == CRFPP::sigma(y) ?x : 0.0;
}
inline void daxpy_(int n, double da, const double *dx, double *dy) {
for (int i = 0; i < n; ++i)
dy[i] += da * dx[i];
}
inline double ddot_(int size, const double *dx, const double *dy) {
return std::inner_product(dx, dx + size, dy, 0.0);
}
void mcstep(double *stx, double *fx, double *dx,
double *sty, double *fy, double *dy,
double *stp, double fp, double dp,
int *brackt,
double stpmin, double stpmax,
int *info) {
bool bound = true;
double p, q, s, d1, d2, d3, r, gamma, theta, stpq, stpc, stpf;
*info = 0;
if (*brackt && ((*stp <= min(*stx, *sty) || *stp >= max(*stx, *sty)) ||
*dx * (*stp - *stx) >= 0.0 || stpmax < stpmin)) {
return;
}
double sgnd = dp * (*dx / std::abs(*dx));
if (fp > *fx) {
*info = 1;
bound = true;
theta =(*fx - fp) * 3 / (*stp - *stx) + *dx + dp;
d1 = std::abs(theta);
d2 = std::abs(*dx);
d1 = max(d1, d2);
d2 = std::abs(dp);
s = max(d1, d2);
d1 = theta / s;
gamma = s * std::sqrt(d1 * d1 - *dx / s *(dp / s));
if (*stp < *stx) {
gamma = -gamma;
}
p = gamma - *dx + theta;
q = gamma - *dx + gamma + dp;
r = p / q;
stpc = *stx + r * (*stp - *stx);
stpq = *stx + *dx / ((*fx - fp) /
(*stp - *stx) + *dx) / 2 * (*stp - *stx);
if ((d1 = stpc - *stx, std::abs(d1)) < (d2 = stpq - *stx, std::abs(d2))) {
stpf = stpc;
} else {
stpf = stpc + (stpq - stpc) / 2;
}
*brackt = true;
} else if (sgnd < 0.0) {
*info = 2;
bound = false;
theta = (*fx - fp) * 3 / (*stp - *stx) + *dx + dp;
d1 = std::abs(theta);
d2 = std::abs(*dx);
d1 = max(d1, d2);
d2 = std::abs(dp);
s = max(d1, d2);
d1 = theta / s;
gamma = s * std::sqrt(d1 * d1 - *dx / s * (dp / s));
if (*stp > *stx) {
gamma = -gamma;
}
p = gamma - dp + theta;
q = gamma - dp + gamma + *dx;
r = p / q;
stpc = *stp + r *(*stx - *stp);
stpq = *stp + dp /(dp - *dx) * (*stx - *stp);
if ((d1 = stpc - *stp, std::abs(d1)) > (d2 = stpq - *stp, std::abs(d2))) {
stpf = stpc;
} else {
stpf = stpq;
}
*brackt = true;
} else if (std::abs(dp) < std::abs(*dx)) {
*info = 3;
bound = true;
theta = (*fx - fp) * 3 / (*stp - *stx) + *dx + dp;
d1 = std::abs(theta);
d2 = std::abs(*dx);
d1 = max(d1, d2);
d2 = std::abs(dp);
s = max(d1, d2);
d3 = theta / s;
d1 = 0.0;
d2 = d3 * d3 - *dx / s *(dp / s);
gamma = s * std::sqrt((max(d1, d2)));
if (*stp > *stx) {
gamma = -gamma;
}
p = gamma - dp + theta;
q = gamma + (*dx - dp) + gamma;
r = p / q;
if (r < 0.0 && gamma != 0.0) {
stpc = *stp + r *(*stx - *stp);
} else if (*stp > *stx) {
stpc = stpmax;
} else {
stpc = stpmin;
}
stpq = *stp + dp /(dp - *dx) * (*stx - *stp);
if (*brackt) {
if ((d1 = *stp - stpc, std::abs(d1)) <
(d2 = *stp - stpq, std::abs(d2))) {
stpf = stpc;
} else {
stpf = stpq;
}
} else {
if ((d1 = *stp - stpc, std::abs(d1)) >
(d2 = *stp - stpq, std::abs(d2))) {
stpf = stpc;
} else {
stpf = stpq;
}
}
} else {
*info = 4;
bound = false;
if (*brackt) {
theta =(fp - *fy) * 3 / (*sty - *stp) + *dy + dp;
d1 = std::abs(theta);
d2 = std::abs(*dy);
d1 = max(d1, d2);
d2 = std::abs(dp);
s = max(d1, d2);
d1 = theta / s;
gamma = s * std::sqrt(d1 * d1 - *dy / s * (dp / s));
if (*stp > *sty) {
gamma = -gamma;
}
p = gamma - dp + theta;
q = gamma - dp + gamma + *dy;
r = p / q;
stpc = *stp + r * (*sty - *stp);
stpf = stpc;
} else if (*stp > *stx) {
stpf = stpmax;
} else {
stpf = stpmin;
}
}
if (fp > *fx) {
*sty = *stp;
*fy = fp;
*dy = dp;
} else {
if (sgnd < 0.0) {
*sty = *stx;
*fy = *fx;
*dy = *dx;
}
*stx = *stp;
*fx = fp;
*dx = dp;
}
stpf = min(stpmax, stpf);
stpf = max(stpmin, stpf);
*stp = stpf;
if (*brackt && bound) {
if (*sty > *stx) {
d1 = *stx + (*sty - *stx) * 0.66;
*stp = min(d1, *stp);
} else {
d1 = *stx + (*sty - *stx) * 0.66;
*stp = max(d1, *stp);
}
}
return;
}
}
namespace CRFPP {
class LBFGS::Mcsrch {
private:
int infoc, stage1, brackt;
double finit, dginit, dgtest, width, width1;
double stx, fx, dgx, sty, fy, dgy, stmin, stmax;
public:
Mcsrch():
infoc(0),
stage1(0),
brackt(0),
finit(0.0), dginit(0.0), dgtest(0.0), width(0.0), width1(0.0),
stx(0.0), fx(0.0), dgx(0.0), sty(0.0), fy(0.0), dgy(0.0),
stmin(0.0), stmax(0.0) {}
void mcsrch(int size,
double *x,
double f, const double *g, double *s,
double *stp,
int *info, int *nfev, double *wa, bool orthant, double C) {
static const double p5 = 0.5;
static const double p66 = 0.66;
static const double xtrapf = 4.0;
static const int maxfev = 20;
/* Parameter adjustments */
--wa;
--s;
--g;
--x;
if (*info == -1) goto L45;
infoc = 1;
if (size <= 0 || *stp <= 0.0) return;
dginit = ddot_(size, &g[1], &s[1]);
if (dginit >= 0.0) return;
brackt = false;
stage1 = true;
*nfev = 0;
finit = f;
dgtest = ftol * dginit;
width = lb3_1_stpmax - lb3_1_stpmin;
width1 = width / p5;
for (int j = 1; j <= size; ++j) {
wa[j] = x[j];
}
stx = 0.0;
fx = finit;
dgx = dginit;
sty = 0.0;
fy = finit;
dgy = dginit;
while (true) {
if (brackt) {
stmin = min(stx, sty);
stmax = max(stx, sty);
} else {
stmin = stx;
stmax = *stp + xtrapf * (*stp - stx);
}
*stp = max(*stp, lb3_1_stpmin);
*stp = min(*stp, lb3_1_stpmax);
if ((brackt && ((*stp <= stmin || *stp >= stmax) ||
*nfev >= maxfev - 1 || infoc == 0)) ||
(brackt && (stmax - stmin <= xtol * stmax))) {
*stp = stx;
}
if (orthant) {
for (int j = 1; j <= size; ++j) {
double grad_neg = 0.0;
double grad_pos = 0.0;
double grad = 0.0;
if (wa[j] == 0.0) {
grad_neg = g[j] - 1.0 / C;
grad_pos = g[j] + 1.0 / C;
} else {
grad_pos = grad_neg = g[j] + 1.0 * sigma(wa[j]) / C;
}
if (grad_neg > 0.0) {
grad = grad_neg;
} else if (grad_pos < 0.0) {
grad = grad_pos;
} else {
grad = 0.0;
}
const double p = pi(s[j], -grad);
const double xi = wa[j] == 0.0 ? sigma(-grad) : sigma(wa[j]);
x[j] = pi(wa[j] + *stp * p, xi);
}
} else {
for (int j = 1; j <= size; ++j) {
x[j] = wa[j] + *stp * s[j];
}
}
*info = -1;
return;
L45:
*info = 0;
++(*nfev);
double dg = ddot_(size, &g[1], &s[1]);
double ftest1 = finit + *stp * dgtest;
if (brackt && ((*stp <= stmin || *stp >= stmax) || infoc == 0)) {
*info = 6;
}
if (*stp == lb3_1_stpmax && f <= ftest1 && dg <= dgtest) {
*info = 5;
}
if (*stp == lb3_1_stpmin && (f > ftest1 || dg >= dgtest)) {
*info = 4;
}
if (*nfev >= maxfev) {
*info = 3;
}
if (brackt && stmax - stmin <= xtol * stmax) {
*info = 2;
}
if (f <= ftest1 && std::abs(dg) <= lb3_1_gtol * (-dginit)) {
*info = 1;
}
if (*info != 0) {
return;
}
if (stage1 && f <= ftest1 && dg >= min(ftol, lb3_1_gtol) * dginit) {
stage1 = false;
}
if (stage1 && f <= fx && f > ftest1) {
double fm = f - *stp * dgtest;
double fxm = fx - stx * dgtest;
double fym = fy - sty * dgtest;
double dgm = dg - dgtest;
double dgxm = dgx - dgtest;
double dgym = dgy - dgtest;
mcstep(&stx, &fxm, &dgxm, &sty, &fym, &dgym, stp, fm, dgm, &brackt,
stmin, stmax, &infoc);
fx = fxm + stx * dgtest;
fy = fym + sty * dgtest;
dgx = dgxm + dgtest;
dgy = dgym + dgtest;
} else {
mcstep(&stx, &fx, &dgx, &sty, &fy, &dgy, stp, f, dg, &brackt,
stmin, stmax, &infoc);
}
if (brackt) {
double d1 = 0.0;
if ((d1 = sty - stx, std::abs(d1)) >= p66 * width1) {
*stp = stx + p5 * (sty - stx);
}
width1 = width;
width = (d1 = sty - stx, std::abs(d1));
}
}
return;
}
};
void LBFGS::clear() {
iflag_ = iscn = nfev = iycn = point = npt =
iter = info = ispt = isyt = iypt = 0;
stp = stp1 = 0.0;
diag_.clear();
w_.clear();
delete mcsrch_;
mcsrch_ = 0;
}
void LBFGS::lbfgs_optimize(int size,
int msize,
double *x,
double f,
const double *g,
double *diag,
double *w,
bool orthant,
double C,
int *iflag) {
double yy = 0.0;
double ys = 0.0;
int bound = 0;
int cp = 0;
--diag;
--g;
--x;
--w;
if (!mcsrch_) mcsrch_ = new Mcsrch;
if (*iflag == 1) goto L172;
if (*iflag == 2) goto L100;
// initialization
if (*iflag == 0) {
point = 0;
for (int i = 1; i <= size; ++i) {
diag[i] = 1.0;
}
ispt = size + (msize << 1);
iypt = ispt + size * msize;
for (int i = 1; i <= size; ++i) {
w[ispt + i] = -g[i] * diag[i];
}
stp1 = 1.0 / std::sqrt(ddot_(size, &g[1], &g[1]));
}
// MAIN ITERATION LOOP
while (true) {
++iter;
info = 0;
if (iter == 1) goto L165;
if (iter > size) bound = size;
// COMPUTE -H*G USING THE FORMULA GIVEN IN: Nocedal, J. 1980,
// "Updating quasi-Newton matrices with limited storage",
// Mathematics of Computation, Vol.24, No.151, pp. 773-782.
ys = ddot_(size, &w[iypt + npt + 1], &w[ispt + npt + 1]);
yy = ddot_(size, &w[iypt + npt + 1], &w[iypt + npt + 1]);
for (int i = 1; i <= size; ++i) {
diag[i] = ys / yy;
}
L100:
cp = point;
if (point == 0) cp = msize;
w[size + cp] = 1.0 / ys;
for (int i = 1; i <= size; ++i) {
w[i] = -g[i];
}
bound = min(iter - 1, msize);
cp = point;
for (int i = 1; i <= bound; ++i) {
--cp;
if (cp == -1) cp = msize - 1;
double sq = ddot_(size, &w[ispt + cp * size + 1], &w[1]);
int inmc = size + msize + cp + 1;
iycn = iypt + cp * size;
w[inmc] = w[size + cp + 1] * sq;
double d = -w[inmc];
daxpy_(size, d, &w[iycn + 1], &w[1]);
}
for (int i = 1; i <= size; ++i) {
w[i] = diag[i] * w[i];
}
for (int i = 1; i <= bound; ++i) {
double yr = ddot_(size, &w[iypt + cp * size + 1], &w[1]);
double beta = w[size + cp + 1] * yr;
int inmc = size + msize + cp + 1;
beta = w[inmc] - beta;
iscn = ispt + cp * size;
daxpy_(size, beta, &w[iscn + 1], &w[1]);
++cp;
if (cp == msize) cp = 0;
}
// STORE THE NEW SEARCH DIRECTION
for (int i = 1; i <= size; ++i) {
w[ispt + point * size + i] = w[i];
}
L165:
// OBTAIN THE ONE-DIMENSIONAL MINIMIZER OF THE FUNCTION
// BY USING THE LINE SEARCH ROUTINE MCSRCH
nfev = 0;
stp = 1.0;
if (iter == 1) {
stp = stp1;
}
for (int i = 1; i <= size; ++i) {
w[i] = g[i];
}
L172:
mcsrch_->mcsrch(size, &x[1], f, &g[1], &w[ispt + point * size + 1],
&stp, &info, &nfev, &diag[1], orthant, C);
if (info == -1) {
*iflag = 1; // next value
return;
}
if (info != 1) {
std::cerr << "The line search routine mcsrch failed: error code:"
<< info << std::endl;
*iflag = -1;
return;
}
// COMPUTE THE NEW STEP AND GRADIENT CHANGE
npt = point * size;
for (int i = 1; i <= size; ++i) {
w[ispt + npt + i] = stp * w[ispt + npt + i];
w[iypt + npt + i] = g[i] - w[i];
}
++point;
if (point == msize) point = 0;
double gnorm = std::sqrt(ddot_(size, &g[1], &g[1]));
double xnorm = max(1.0, std::sqrt(ddot_(size, &x[1], &x[1])));
if (gnorm / xnorm <= eps) {
*iflag = 0; // OK terminated
return;
}
}
return;
}
}
syntax highlighted by Code2HTML, v. 0.9.1