/* $Header: /home/agc/src/libutf-2.10/RCS/utf.c,v 1.8 1997/03/14 16:27:18 agc Exp $ */ /* * Copyright © 1996-1997 Alistair G. Crooks. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Alistair G. Crooks. * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #ifdef HAVE_STRING_H #include #endif #ifdef HAVE_STDARG_H #include #endif #include "ure.h" #include "utf.h" #define RUNELEN(r) (((r) > 0 && (r) <= 0x007f) ? 1 : ((r) >= 0x0800) ? 3 : 2) #ifndef MIN #define MIN(a, b) (((a) < (b)) ? (a) : (b)) #endif /* !MIN */ /* translate a single Rune to a UTF sequence, returning # of bytes produced */ int runetochar(char *cp, Rune *rp) { switch(RUNELEN(*rp)) { case 1: *cp++ = (unsigned char)(*rp & 0x7f); return 1; case 2: *cp++ = (0xc0 | ((*rp & 0x07c0) >> 6)); *cp++ = (0x80 | (*rp & 0x003f)); return 2; case 3: cp[0] = 0xe0 | ((*rp & 0xf000) >> 12); cp[1] = (0x80 | ((*rp & 0x0fc0) >> 6)); cp[2] = (0x80 | (*rp & 0x003f)); return 3; } /* can't happen */ return -1; } /* translate a UTF sequence to a Rune, returning # of bytes consumed */ int chartorune(Rune *rp, char *cp) { if (cp != (char *) NULL) { if ((*cp & 0x80) == 0) { *rp = (*cp & 0x7f); return 1; } if ((*cp & 0xe0) == 0xc0 && (*(cp + 1) & 0xc0) == 0x80) { *rp = ((*cp & 0x1f) << 6) | (cp[1] & 0x3f); return 2; } if ((*cp & 0xf0) == 0xe0 && (*(cp + 1) & 0xc0) == 0x80 && (*(cp + 2) & 0xc0) == 0x80) { *rp = ((*cp & 0x0f) << 12) | ((cp[1] & 0x3f) << 6) | (cp[2] & 0x3f); return 3; } } /* `rune error' - return error rune, length 1 */ *rp = Runeerror; return 1; } /* return the # of bytes in the UTF encoding of a Rune */ int runelen(long r) { return RUNELEN(r); } /* return 1 if `n' bytes of `cp' contains a complete UTF encoding */ int fullrune(char *cp, int n) { Rune r; int len; if (n > 0) { len = chartorune(&r, cp); if (r != Runeerror) { return (n == len); } } return 0; } /* return the number of runes in a UTF string */ int utflen(char *s) { Rune r; int rc; int i; rc = 0; for (;;) { i = chartorune(&r, s); if (r == 0) { break; } s += i; rc++; } return rc; } /* return the number of runes in an n-byte UTF string */ int utfnlen(char *s, int n) { Rune r; int rc; int i; for (rc = 0 ; n-- > 0 ; ) { i = chartorune(&r, s); if (r == 0) { break; } s += i; rc++; } return rc; } /* return the number of bytes in a UTF string */ int utfbytes(char *s) { char *cp; Rune r; int i; cp = s; for (;;) { i = chartorune(&r, cp); if (r == 0) { break; } cp += i; } return cp - s; } /* point to the first occurrence of `r' in the UTF sequence */ char * utfrune(char *cp, long r) { Rune rch; int len; for (;;) { len = chartorune(&rch, cp); if (rch == r) { return cp; } if (rch == 0) { return (char *) NULL; } cp += len; } } /* point to the last occurrence of `r' in the UTF sequence */ char * utfrrune(char *cp, long r) { char *last; Rune rch; int len; last = (char *) NULL; for (;;) { len = chartorune(&rch, cp); if (rch == r) { last = cp; } if (rch == 0) { return last; } cp += len; } } /* return the first occurrence of UTF string `little' in `big' */ char * utfutf(char *big, char *little) { Rune r; char *cp; int bytes; int len; cp = little; do { len = chartorune(&r, cp); cp += len; } while (r != 0); bytes = (cp - little) - len; (void) chartorune(&r, little); for (cp = big ; (cp = utfrune(cp, r)) != (char *) NULL ; ) { if (memcmp(cp, little, bytes) == 0) { return cp; } } return (char *) NULL; } /* compare, lexicographically by Rune, two UTF strings */ int utfcmp(char *s1, char *s2) { Rune r1; Rune r2; do { s1 += chartorune(&r1, s1); s2 += chartorune(&r2, s2); } while (r1 == r2 && r1 != 0 && r2 != 0); return r2 - r1; } /* compare, lexicographically by Rune, two UTF strings at most rc Runes long */ int utfncmp(char *s1, char *s2, int rc) { Rune r1; Rune r2; do { s1 += chartorune(&r1, s1); s2 += chartorune(&r2, s2); } while (r1 == r2 && r1 != 0 && r2 != 0 && rc-- > 0); return r2 - r1; } /* span in s1 for set s2, return # of bytes */ int utfspan(char *s1, char *s2, int *rc) { Rune r; char *cp; int i; cp = s1; *rc = 0; for (;;) { i = chartorune(&r, cp); if (r == 0 || utfrune(s2, r) == (char *) NULL) { return cp - s1; } cp += i; *rc += 1; } } /* span in s1 for the complement of set s2, return # of bytes */ int utfcspan(char *s1, char *s2, int *rc) { Rune r; char *cp; int i; cp = s1; *rc = 0; for (;;) { i = chartorune(&r, cp); if (r == 0 || utfrune(s2, r) != (char *) NULL) { return cp - s1; } cp += i; *rc += 1; } } /* get the rune prior to s, and return its length */ int priorrune(Rune *rp, char *s) { s -= 1; if ((*s & 0x80) != 0) { s -= 1; if ((*s & 0xe0) != 0xc0) { s -= 1; } } return chartorune(rp, s); } /* functionally equivalent to strpbrk for utf strings */ char * utffindrune(char *s, char *charset) { Rune r; char *cp; int i; for (;;) { i = chartorune(&r, charset); if (r == 0) { return (char *) NULL; } if ((cp = utfrune(s, r)) != (char *) NULL) { return cp; } charset += i; } } /* compare an array of `n' runes against a utf string */ int runeutfncmp(Rune *rp, char *up, int n) { Rune r; int diff; int rc; int i; for (diff = rc = 0 ; rc < n ; rc++) { i = chartorune(&r, up); up += i; if ((diff = r - rp[rc]) != 0) { break; } } return diff; } #ifndef HAVE_MEMMOVE /* overlapping-safe memory move function */ static char * memmove(char *dst, char *src, int nbytes) { char *ret; if ((ret = dst) >= src && dst <= &src[nbytes]) { for (dst += nbytes, src += nbytes ; nbytes-- > 0 ; ) { *--dst = *--src; } } else { while (nbytes-- > 0) { *dst++ = *src++; } } return ret; } #endif /* copy utf string src to dst */ char * utfcpy(char *dst, char *src) { char *cp; /* make sure we get the null byte */ return memmove(dst, src, utfbytes(src) + 1); } /* copy utf string src to dst */ char * utfncpy(char *dst, char *src, int nbytes) { int len; if ((len = utfbytes(src)) < nbytes) { dst[len] = 0; } return memmove(dst, src, MIN(len, nbytes)); } char * utfcat(char *src, char *append) { (void) utfcpy(&src[utfbytes(src)], append); return src; } char * utfncat(char *src, char *append, int slen) { int len; len = utfbytes(src); (void) utfncpy(&src[len], append, slen - len); return src; } /* a particluarly dumb implementation of snprintf - just does simple %[sdcx] */ int utf_snprintf(char *buf, int size, char *fmt, ...) { va_list vp; char lfmt[BUFSIZ]; char lbuf[BUFSIZ]; char done; char *lfp; char *fp; char *bp; char *s; char c; long i; va_start(vp, fmt); for (bp = buf, fp = fmt, done = 0 ; bp - buf < size && !done ; ) { switch(*fp) { case '%': switch(*++fp) { case '%': *bp++ = *fp++; break; case 'c': *bp++ = (char) va_arg(vp, int); fp++; break; case 's': if ((s = (char *) va_arg(vp, char *)) == (char *) NULL) { s = "(null)"; } for (i = utfbytes(s); bp - buf < size && i-- > 0 ; ) { *bp++ = *s++; } fp++; break; default: for (lfp = lfmt, *lfp++ = '%' ; isdigit(*fp) ; ) { *lfp++ = *fp++; } if (*fp == 'l') { *lfp++ = *fp++; i = (long) va_arg(vp, long); } else if (*fp == 'h') { *lfp++ = *fp++; i = (short) va_arg(vp, int); } else { i = (int) va_arg(vp, int); } if (*fp == 'd' || *fp == 'x' || *fp == 'o') { *lfp++ = *fp++; } else { *lfp++ = 'd'; } *lfp = 0; (void) sprintf(lbuf, lfmt, i); for (i = utfbytes(s = lbuf); bp - buf < size && i-- > 0 ; ) { *bp++ = *s++; } } break; case 0: done = 1; break; default: *bp++ = *fp++; } } if (bp - buf < size) { *bp = 0; } else { buf[size - 1] = 0; } va_end(vp); return bp - buf; }