#ifndef lint static char Rcs_Id[] = "$Id: defmt.c,v 1.58 2001/08/01 22:15:56 geoff Exp $"; #endif /* * defmt.c - Handle formatter constructs, mostly by scanning over them. * * This code originally resided in ispell.c, but was moved here to keep * file sizes smaller. * * Copyright (c), 1983, by Pace Willisson * * Copyright 1992, 1993, 1999, 2001, Geoff Kuenning, Claremont, CA * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All modifications to the source code must be clearly marked as * such. Binary redistributions based on modified source code * must be clearly marked as modified versions in the documentation * and/or other materials provided with the distribution. * 4. Any web site or other electronic service that offers ispell for * download or other electronic transfer as a separate entity, in * either source or binary form, must also include a prominent statement * indicating that information about ispell can be obtained from the * following Web site URL: * http://fmg-www.cs.ucla.edu/geoff/ispell.html * If the offering service supports hyperlinks, the aforementioned * Web site must also be offered as a hyperlink. Condition #4 does * not apply if ispell is offered only as part of a larger, aggregated * product such as a word processor or packaged operating system. * 5. The name of Geoff Kuenning may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * The TeX code is originally by Greg Schaffer, with many improvements from * Ken Stevens. The nroff code is primarily from Pace Willisson, although * other people have improved it. */ /* * $Log: defmt.c,v $ * Revision 1.58 2001/08/01 22:15:56 geoff * When processing quoted strings inside HTML tags, don't handle * ampersand sequences unless the quoted string is being spell-checked. * That way, ampersands inside HREF links won't confuse the deformatter, * but ampersand sequence inside ALT= tags will be handled correctly. * Also, when skipping ampersand sequences inside quoted strings, give up * the search for the semicolon if the closing quote is hit. This change * makes us a bit more robust in the fact of HTML syntax errors. * * Revision 1.57 2001/07/25 21:51:47 geoff * Minor license update. * * Revision 1.56 2001/07/23 20:24:03 geoff * Update the copyright and the license. * * Revision 1.55 2000/08/22 10:52:25 geoff * Fix a whole bunch of signed/unsigned compiler warnings. * * Revision 1.54 2000/08/22 00:11:25 geoff * Add support for correct_verbose_mode. * * Revision 1.53 1999/11/04 08:16:54 geoff * Add a few more special TeX sequences * * Revision 1.52 1999/01/18 03:28:27 geoff * Turn many char declarations into unsigned char, so that we won't have * sign-extension problems. * * Revision 1.51 1999/01/07 01:57:53 geoff * Update the copyright. * * Revision 1.50 1999/01/03 01:46:28 geoff * Add support for external deformatters. * * Revision 1.49 1998/07/07 02:30:53 geoff * Implement the plus notation for adding to keyword lists * * Revision 1.48 1998/07/06 06:55:13 geoff * Generalize the HTML tag routines to be handy keyword lookup routines, * and fix some (but by no means all) of the TeX deformatting to take * advantage of these routines to allow a bit more flexibility in * processing private TeX commands. * * Revision 1.47 1998/07/06 05:34:10 geoff * Add a few more TeX commands * * Revision 1.46 1997/12/01 00:53:47 geoff * Add HTML support. Fix the "\ " bug again, this time (hopefully) right. * * Revision 1.45 1995/11/08 05:09:29 geoff * Add the new interactive mode ("askverbose"). * * Revision 1.44 1995/11/08 04:32:51 geoff * Modify the HTML support to be stylistically more consistent and to * interoperate more cleanly with the nroff and TeX modes. * * Revision 1.43 1995/10/25 04:05:31 geoff * html-mode code added by Gerry Tierney 14th of * Oct '95. * * Revision 1.42 1995/10/25 03:35:42 geoff * After skipping over a backslash sequence, make sure that we don't skip * over the first character of the following word. Also, support the * verbatim environment of LaTex. * * Revision 1.41 1995/08/05 23:19:47 geoff * Get rid of an obsolete comment. Add recognition of documentclass and * usepackage for Latex2e support. * * Revision 1.40 1995/03/06 02:42:43 geoff * Change TeX backslash processing so that it only assumes alpha * characters and the commonly-used "@" character are part of macro * names, and so that any other backslashed character (specifically * dollar signs) is skipped. * * Revision 1.39 1995/01/08 23:23:54 geoff * Fix typos in a couple of comments. * * Revision 1.38 1995/01/03 19:24:14 geoff * Add code to handle the LaTeX \verb command. * * Revision 1.37 1994/12/27 23:08:54 geoff * Fix a bug in TeX backslash processing that caused ispell to become * confused when it encountered an optional argument to a * double-backslash command. Be a little smarter about scanning for * curly-brace matches, so that we avoid missing a math-mode transition * during the scan. * * Revision 1.36 1994/10/25 05:46:34 geoff * Recognize a few more Latex commands: pagestyle, pagenumbering, * setcounter, addtocounter, setlength, addtolength, settowidth. * * Revision 1.35 1994/10/18 04:03:19 geoff * Add code to skip hex numbers if they're preceded by '0x'. * * Revision 1.34 1994/10/04 03:51:24 geoff * Modify the parsing so that TeX commands are ignored even within * comments, but do not affect the overall parsing state. (This is * slightly imperfect, in that some types of modality are ignored when * comments are entered. But it should solve nearly all the problems * with commented-out TeX commands.) This also fixes a couple of minor * bugs with TeX deformatting. * * Revision 1.33 1994/10/03 17:06:07 geoff * Remember to use contextoffset when reporting complete misses * * Revision 1.32 1994/08/31 05:58:41 geoff * Report the offset-within-line correctly in -a mode even if the line is * longer than BUFSIZ characters. * * Revision 1.31 1994/05/25 04:29:28 geoff * If two boundary characters appear in a row, consider it the end of the * word. * * Revision 1.30 1994/05/17 06:44:08 geoff * Add the new argument to all calls to good and compoundgood. * * Revision 1.29 1994/03/16 06:30:41 geoff * Don't lose track of math mode when an array environment is embedded. * * Revision 1.28 1994/03/15 05:31:57 geoff * Add TeX_strncmp, which allows us to handle AMS-TeX constructs like * \endroster without getting confused. * * Revision 1.27 1994/02/14 00:34:53 geoff * Pass length arguments to correct(). * * Revision 1.26 1994/01/25 07:11:25 geoff * Get rid of all old RCS log lines in preparation for the 3.1 release. * */ #include #include "config.h" #include "ispell.h" #include "proto.h" #include "msgs.h" static unsigned char * skiptoword P ((unsigned char * bufp)); unsigned char * skipoverword P ((unsigned char * bufp)); void checkline P ((FILE * ofile)); static int TeX_math_end P ((unsigned char ** bufp)); static int TeX_math_begin P ((unsigned char ** bufp)); static int TeX_LR_begin P ((unsigned char ** bufp)); static int TeX_LR_check P ((int begin_p, unsigned char ** bufp)); static void TeX_skip_args P ((unsigned char ** bufp)); static int TeX_math_check P ((int cont_char, unsigned char ** bufp)); static void TeX_skip_parens P ((unsigned char ** bufp)); static void TeX_open_paren P ((unsigned char ** bufp)); static void TeX_skip_check P ((unsigned char ** bufp)); static int TeX_strncmp P ((unsigned char * a, char * b, int n)); int init_keyword_table P ((char * rawtags, char * envvar, char * deftags, int ignorecase, struct kwtable * keywords)); static int keyword_in_list P ((unsigned char * string, unsigned char * stringend, struct kwtable * keywords)); static int tagcmp P ((unsigned char ** a, unsigned char ** b)); #define ISTEXTERM(c) (((c) == TEXLEFTCURLY) || \ ((c) == TEXRIGHTCURLY) || \ ((c) == TEXLEFTSQUARE) || \ ((c) == TEXRIGHTSQUARE)) #define ISMATHCH(c) (((c) == TEXBACKSLASH) || \ ((c) == TEXDOLLAR) || \ ((c) == TEXPERCENT)) static int TeX_comment = 0; /* * The following variables are used to save the parsing state when * processing comments. This allows comments to be parsed without * affecting the overall nesting. */ static int save_math_mode; static char save_LaTeX_Mode; /* * The following variables are used by the deformatter to keep * track of keywords that may indicate text to be ignored. */ static unsigned char * keywordbuf; /* Scratch buffer for keyword comparison */ static unsigned int maxkeywordlen; /* Length of longest keyword */ static unsigned char * skiptoword (bufp) /* Skip to beginning of a word */ unsigned char * bufp; { unsigned char * htmltagstart; /* Beginning of an HTML tag */ unsigned char * htmlsubfield = bufp; /* Ptr to start of subfield name */ while (*bufp && ((!isstringch (bufp, 0) && !iswordch (chartoichar (*bufp))) || isboundarych (chartoichar (*bufp)) || (tflag == DEFORMAT_TEX && (math_mode & 1)) || (insidehtml & (HTML_IN_SPEC | HTML_ISIGNORED)) != 0 || ((insidehtml & HTML_IN_TAG) != 0 && (insidehtml & (HTML_IN_QUOTE | HTML_CHECKING_QUOTE)) != (HTML_IN_QUOTE | HTML_CHECKING_QUOTE)) ) ) { /* * HTML deformatting */ if (tflag == DEFORMAT_SGML) { if ((insidehtml & HTML_IN_TAG) != 0 && *bufp == HTMLQUOTE) { if (insidehtml & HTML_IN_QUOTE) insidehtml &= ~(HTML_IN_QUOTE | HTML_CHECKING_QUOTE); else insidehtml |= HTML_IN_QUOTE; htmlsubfield = NULL; } else if ((insidehtml & (HTML_IN_TAG | HTML_IN_QUOTE)) == HTML_IN_TAG && *bufp == HTMLTAGEND) insidehtml &= ~(HTML_IN_TAG | HTML_IN_ENDTAG | HTML_CHECKING_QUOTE); /* * If we are checking an HTML file, we want to ignore any HTML * tags. These should start with a '<' and end with a '>', so * we simply skip over anything between these two symbols. If * we reach the end of the line before finding a matching '>', * we set 'insidehtml' appropriately. */ else if ((insidehtml & HTML_IN_TAG) == 0 && *bufp == HTMLTAGSTART) { insidehtml |= HTML_IN_TAG; bufp++; if (*bufp == HTMLSLASH) { bufp++; insidehtml |= HTML_IN_ENDTAG; } htmltagstart = bufp; /* * We found the start of an HTML tag. Skip to the end * of the tag. We assume that all tags are made up of * purely alphabetic characters. */ while (isalpha (*bufp)) bufp++; /* * Check to see if this is an ignored tag, and set * HTML_IGNORE properly. */ if (keyword_in_list (htmltagstart, bufp, &htmlignorelist)) { /* * Note that we use +/- here, rather than Boolean * operators. This is quite deliberate, because * it allows us to properly handle nested HTML * constructs that are supposed to be ignored. */ if (insidehtml & HTML_IN_ENDTAG) insidehtml -= HTML_IGNORE; else insidehtml += HTML_IGNORE; } htmlsubfield = NULL; if (bufp > htmltagstart) bufp--; } else if ((insidehtml & (HTML_IN_TAG | HTML_IN_QUOTE)) == HTML_IN_TAG) { if (htmlsubfield == NULL && isalpha (*bufp)) htmlsubfield = bufp; else if (htmlsubfield != NULL && !isalpha (*bufp)) { if (bufp != htmlsubfield && keyword_in_list (htmlsubfield, bufp, &htmlchecklist)) insidehtml |= HTML_CHECKING_QUOTE; htmlsubfield = NULL; } } /* * Skip over quoted entities such as """. These all * start with an ampersand and end with a semi-colon. We * do not need to worry about them extending over more * than one line. We also don't need to worry about them * being string characters, because the isstringch() test * above would have already broken us out of the enclosing * loop in that case. * * A complication is that quoted entities are only * interpreted in some quoted strings. For example, * they're valid in ALT= tags but not in HREF tags, where * ampersands have an entirely different meaning. We deal * with the problem by only interpreting HTMLSPECSTART if * we are checking the quoted string. */ else if ((insidehtml & HTML_IN_SPEC) != 0 || (*bufp == HTMLSPECSTART && ((insidehtml & HTML_CHECKING_QUOTE) || (insidehtml & HTML_IN_QUOTE) == 0))) { while (*bufp != HTMLSPECEND && *bufp != '\0') { if ((insidehtml & HTML_IN_QUOTE) && *bufp == HTMLQUOTE) { /* * The quoted string ended before the * ampersand sequence finished. The HTML is * probably incorrect, but it would be a * mistake to keep skipping until we reach the * next random semicolon. Instead, just stop * skipping right here. */ insidehtml &= ~(HTML_IN_QUOTE | HTML_CHECKING_QUOTE); break; } bufp++; } if (*bufp == '\0') insidehtml |= HTML_IN_SPEC; else insidehtml &= ~HTML_IN_SPEC; } } else if (tflag == DEFORMAT_TEX) /* TeX or LaTeX stuff */ { /* Odd numbers mean we are in "math mode" */ /* Even numbers mean we are in LR or */ /* paragraph mode */ if (*bufp == TEXPERCENT) { if (!TeX_comment) { save_math_mode = math_mode; save_LaTeX_Mode = LaTeX_Mode; math_mode = 0; LaTeX_Mode = 'P'; TeX_comment = 1; } } else if (math_mode & 1) { if ((LaTeX_Mode == 'e' && TeX_math_check('e', &bufp)) || (LaTeX_Mode == 'm' && TeX_LR_check(1, &bufp))) math_mode--; /* end math mode */ else { while (*bufp && !ISMATHCH(*bufp)) bufp++; if (*bufp == 0) break; if (TeX_math_end(&bufp)) math_mode--; } if (math_mode < 0) { (void) fprintf (stderr, DEFMT_C_TEX_MATH_ERROR); math_mode = 0; } } else { if (math_mode > 1 && *bufp == TEXRIGHTCURLY && (math_mode < (math_mode & 127) * 128)) math_mode--; /* re-enter math */ else if (LaTeX_Mode == 'm' || (math_mode && (math_mode >= (math_mode & 127) * 128) && (TeX_strncmp(bufp, "\\end", 4) == 0))) { if (TeX_LR_check(0, &bufp)) math_mode--; } else if (LaTeX_Mode == 'b' && TeX_math_check('b', &bufp)) { /* continued begin */ math_mode++; } else if (LaTeX_Mode == 'r') { /* continued "reference" */ TeX_skip_parens(&bufp); LaTeX_Mode = 'P'; } else if (TeX_math_begin(&bufp)) /* checks references and */ /* skips \ commands */ math_mode++; } if (*bufp == 0) break; } else if (tflag == DEFORMAT_NROFF) /* nroff deformatting */ { if (*bufp == NRBACKSLASH) { switch ( bufp[1] ) { case 'f': if(bufp[2] == NRLEFTPAREN) { /* font change: \f(XY */ bufp += 5; } else { /* ) */ /* font change: \fX */ bufp += 3; } continue; case 's': /* size change */ bufp += 2; if (*bufp == '+' || *bufp == '-') bufp++; /* This looks wierd 'cause we ** assume *bufp is now a digit. */ bufp++; if (isdigit (*bufp)) bufp++; continue; default: if (bufp[1] == NRLEFTPAREN) { /* extended char set */ /* escape: \(XX */ /* ) */ bufp += 4; continue; } else if (bufp[1] == NRSTAR) { if (bufp[2] == NRLEFTPAREN) bufp += 5; else bufp += 3; continue; } break; } } } /* * Skip hex numbers, but not if we're in non-terse askmode. * (In that case, we'd lose sync if we skipped hex.) */ if (*bufp == '0' && (bufp[1] == 'x' || bufp[1] == 'X') && (terse || !aflag)) { bufp += 2; while (isxdigit (*bufp)) bufp++; } else bufp++; } if (*bufp == '\0') { if (TeX_comment) { math_mode = save_math_mode; LaTeX_Mode = save_LaTeX_Mode; TeX_comment = 0; } } return bufp; } unsigned char * skipoverword (bufp) /* Return pointer to end of a word */ register unsigned char * bufp; /* Start of word -- MUST BE A REAL START */ { register unsigned char * lastboundary; register int scharlen; /* Length of a string character */ lastboundary = NULL; for ( ; ; ) { if (*bufp == '\0') { if (TeX_comment) { math_mode = save_math_mode; LaTeX_Mode = save_LaTeX_Mode; TeX_comment = 0; } break; } else if (l_isstringch(bufp, scharlen, 0)) { bufp += scharlen; lastboundary = NULL; } /* ** Note that we get here if a character satisfies ** isstringstart() but isn't in the string table; this ** allows string characters to start with word characters. */ else if (iswordch (chartoichar (*bufp))) { bufp++; lastboundary = NULL; } else if (isboundarych (chartoichar (*bufp))) { if (lastboundary == NULL) lastboundary = bufp; else if (lastboundary == bufp - 1) break; /* Double boundary -- end of word */ bufp++; } else break; /* End of the word */ } /* ** If the word ended in one or more boundary characters, ** the address of the first of these is in lastboundary, and it ** is the end of the word. Otherwise, bufp is the end. */ return (lastboundary != NULL) ? lastboundary : bufp; } void checkline (ofile) FILE * ofile; { register unsigned char * p; register unsigned char * endp; int hadlf; register int len; register int i; int ilen; currentchar = filteredbuf; len = strlen ((char *) filteredbuf) - 1; hadlf = filteredbuf[len] == '\n'; if (hadlf) { filteredbuf[len] = '\0'; contextbufs[0][len] = '\0'; } if (tflag == DEFORMAT_NROFF) { /* skip over .if */ if (*currentchar == NRDOT && (strncmp ((char *) currentchar + 1, "if t", 4) == 0 || strncmp ((char *) currentchar + 1, "if n", 4) == 0)) { copyout (¤tchar,5); while (*currentchar && myspace (chartoichar (*currentchar))) copyout (¤tchar, 1); } /* skip over .ds XX or .nr XX */ if (*currentchar == NRDOT && (strncmp ((char *) currentchar + 1, "ds ", 3) == 0 || strncmp ((char *) currentchar + 1, "de ", 3) == 0 || strncmp ((char *) currentchar + 1, "nr ", 3) == 0)) { copyout (¤tchar, 4); while (*currentchar && myspace (chartoichar (*currentchar))) copyout(¤tchar, 1); while (*currentchar && !myspace (chartoichar (*currentchar))) copyout(¤tchar, 1); if (*currentchar == 0) { if (!lflag && hadlf) (void) putc ('\n', ofile); return; } } } /* if this is a formatter command, skip over it */ if (tflag == DEFORMAT_NROFF && *currentchar == NRDOT) { while (*currentchar && !myspace (chartoichar (*currentchar))) { if (!aflag && !lflag) (void) putc (*currentchar, ofile); currentchar++; } if (*currentchar == 0) { if (!lflag && hadlf) (void) putc ('\n', ofile); return; } } for ( ; ; ) { p = skiptoword (currentchar); if (p != currentchar) copyout (¤tchar, p - currentchar); if (*currentchar == 0) break; p = ctoken; endp = skipoverword (currentchar); while (currentchar < endp && p < ctoken + sizeof ctoken - 1) *p++ = *currentchar++; *p = 0; if (strtoichar (itoken, ctoken, INPUTWORDLEN * sizeof (ichar_t), 0)) (void) fprintf (stderr, WORD_TOO_LONG ((char *) ctoken)); ilen = icharlen (itoken); if (lflag) { if (ilen > minword && !good (itoken, 0, 0, 0, 0) && !cflag && !compoundgood (itoken, 0)) (void) fprintf (ofile, "%s\n", (char *) ctoken); } else { if (aflag) { if (ilen <= minword) { /* matched because of minword */ if (!terse) { if (askverbose) (void) fprintf (ofile, "ok\n"); else { if (correct_verbose_mode) (void) fprintf (ofile, "* %s\n", ctoken ); else (void) fprintf (ofile, "*\n"); } } continue; } if (good (itoken, 0, 0, 0, 0)) { if (hits[0].prefix == NULL && hits[0].suffix == NULL) { /* perfect match */ if (!terse) { if (askverbose) (void) fprintf (ofile, "ok\n"); else { if (correct_verbose_mode) (void) fprintf (ofile, "* %s\n", ctoken ); else (void) fprintf (ofile, "*\n"); } } } else if (!terse) { /* matched because of root */ if (askverbose) (void) fprintf (ofile, "ok (derives from root %s)\n", (char *) hits[0].dictent->word); else { if (correct_verbose_mode) (void) fprintf (ofile, "+ %s %s\n", ctoken, hits[0].dictent->word); else (void) fprintf (ofile, "+ %s\n", hits[0].dictent->word); } } } else if (compoundgood (itoken, 0)) { /* compound-word match */ if (!terse) { if (askverbose) (void) fprintf (ofile, "ok (compound word)\n"); else { if (correct_verbose_mode) (void) fprintf (ofile, "- %s\n", ctoken); else (void) fprintf (ofile, "-\n"); } } } else { makepossibilities (itoken); if (pcount) { /* ** print & or ?, ctoken, then ** character offset, possibility ** count, and the possibilities. */ if (askverbose) (void) fprintf (ofile, "how about"); else (void) fprintf (ofile, "%c %s %d %d", easypossibilities ? '&' : '?', (char *) ctoken, easypossibilities, (int) ((currentchar - filteredbuf) - strlen ((char *) ctoken)) + contextoffset); for (i = 0; i < MAXPOSSIBLE; i++) { if (possibilities[i][0] == 0) break; (void) fprintf (ofile, "%c %s", i ? ',' : ':', possibilities[i]); } (void) fprintf (ofile, "\n"); } else { /* ** No possibilities found for word TOKEN */ if (askverbose) (void) fprintf (ofile, "not found\n"); else (void) fprintf (ofile, "# %s %d\n", (char *) ctoken, (int) ((currentchar - filteredbuf) - strlen ((char *) ctoken)) + contextoffset); } } } else { if (!quit) correct (ctoken, sizeof ctoken, itoken, sizeof itoken, ¤tchar); } } if (!aflag && !lflag) (void) fprintf (ofile, "%s", (char *) ctoken); } if (!lflag && hadlf) (void) putc ('\n', ofile); } /* must check for \begin{mbox} or whatever makes new text region. */ static int TeX_math_end (bufp) unsigned char ** bufp; { if (**bufp == TEXDOLLAR) { if ((*bufp)[1] == TEXDOLLAR) (*bufp)++; return 1; } else if (**bufp == TEXPERCENT) { if (!TeX_comment) { save_math_mode = math_mode; save_LaTeX_Mode = LaTeX_Mode; math_mode = 0; LaTeX_Mode = 'P'; TeX_comment = 1; } return 0; } /* processing extended TeX command */ (*bufp)++; if (**bufp == TEXRIGHTPAREN || **bufp == TEXRIGHTSQUARE) return 1; if (TeX_LR_begin (bufp)) /* check for switch back to LR mode */ return 1; if (TeX_strncmp (*bufp, "end", 3) == 0) /* find environment that is ending */ return TeX_math_check ('e', bufp); else return 0; } static int TeX_math_begin (bufp) unsigned char ** bufp; { int didskip = 0; if (**bufp == TEXDOLLAR) { if ((*bufp)[1] == TEXDOLLAR) (*bufp)++; return 1; } while (**bufp == TEXBACKSLASH) { didskip = 1; (*bufp)++; /* check for null char here? */ if (**bufp == TEXLEFTPAREN || **bufp == TEXLEFTSQUARE) return 1; else if (!isalpha(**bufp) && **bufp != '@') { (*bufp)++; continue; } else if (TeX_strncmp (*bufp, "begin", 5) == 0) { if (TeX_math_check ('b', bufp)) return 1; else (*bufp)--; } else { TeX_skip_check (bufp); return 0; } } /* * Ignore references for the tib (1) bibliography system, that * is, text between a ``[.'' or ``<.'' and ``.]'' or ``.>''. * We don't care whether they match, tib doesn't care either. * * A limitation is that the entire tib reference must be on one * line, or we break down and check the remainder anyway. */ if ((**bufp == TEXLEFTSQUARE || **bufp == TEXLEFTANGLE) && (*bufp)[1] == TEXDOT) { (*bufp)++; while (**bufp) { if (*(*bufp)++ == TEXDOT && (**bufp == TEXRIGHTSQUARE || **bufp == TEXRIGHTANGLE)) return TeX_math_begin (bufp); } return 0; } else if (didskip) { /* * If we've skipped over anything, it's possible that we are * pointing at an important character. If so, we need to back up * one byte, because our caller will increment bufp. Yes, * this is a kludge. This whole TeX deformatter is a mess. */ (*bufp)--; } return 0; } static int TeX_LR_begin (bufp) unsigned char ** bufp; { if ((TeX_strncmp (*bufp, "mbox", 4) == 0) || (TeX_strncmp (*bufp, "makebox", 7) == 0) || (TeX_strncmp (*bufp, "text", 4) == 0) || (TeX_strncmp (*bufp, "intertext", 9) == 0) || (TeX_strncmp (*bufp, "fbox", 4) == 0) || (TeX_strncmp (*bufp, "framebox", 8) == 0)) math_mode += 2; else if ((TeX_strncmp (*bufp, "parbox", 6) == 0) || (TeX_strncmp (*bufp, "raisebox", 8) == 0)) { math_mode += 2; TeX_open_paren (bufp); if (**bufp) (*bufp)++; else LaTeX_Mode = 'r'; /* same as reference -- skip {} */ } else if (TeX_strncmp (*bufp, "begin", 5) == 0) return TeX_LR_check (1, bufp); /* minipage */ else return 0; /* skip tex command name and optional or width arguments. */ TeX_open_paren (bufp); return 1; } static int TeX_LR_check (begin_p, bufp) int begin_p; unsigned char ** bufp; { TeX_open_paren (bufp); if (**bufp == 0) /* { */ { LaTeX_Mode = 'm'; return 0; /* remain in math mode until '}' encountered. */ } else LaTeX_Mode = 'P'; if (strncmp ((char *) ++(*bufp), "minipage", 8) == 0) { TeX_skip_parens (bufp); if (**bufp) (*bufp)++; if (begin_p) { TeX_skip_parens (bufp); /* now skip opt. args if on this line. */ math_mode += 2; /* indicate minipage mode. */ math_mode += ((math_mode & 127) - 1) * 128; } else { math_mode -= (math_mode & 127) * 128; if (math_mode < 0) { (void) fprintf (stderr, DEFMT_C_LR_MATH_ERROR); math_mode = 1; } } return 1; } (*bufp)--; return 0; } /* Skips the begin{ARG}, and optionally up to two {PARAM}{PARAM}'s to * the begin if they are required. However, Only skips if on this line. */ static void TeX_skip_args (bufp) unsigned char ** bufp; { register int skip_cnt = 0; /* Max of 2. */ if (strncmp((char *) *bufp, "tabular", 7) == 0 || strncmp((char *) *bufp, "minipage", 8) == 0) skip_cnt++; if (strncmp((char *) *bufp, "tabular*", 8) == 0) skip_cnt++; TeX_skip_parens (bufp); /* Skip to the end of the \begin{} parens */ if (**bufp) (*bufp)++; else return; if (skip_cnt--) TeX_skip_parens (bufp); /* skip 1st {PARAM}. */ else return; if (**bufp) (*bufp)++; else return; if (skip_cnt) TeX_skip_parens (bufp); /* skip to end of 2nd {PARAM}. */ } static int TeX_math_check (cont_char, bufp) int cont_char; unsigned char ** bufp; { TeX_open_paren (bufp); /* Check for end of line, continue later. */ if (**bufp == 0) { LaTeX_Mode = (char) cont_char; return 0; } else LaTeX_Mode = 'P'; if (strncmp ((char *) ++(*bufp), "equation", 8) == 0 || strncmp ((char *) *bufp, "eqnarray", 8) == 0 || strncmp ((char *) *bufp, "displaymath", 11) == 0 || strncmp ((char *) *bufp, "picture", 7) == 0 || strncmp ((char *) *bufp, "gather", 6) == 0 || strncmp ((char *) *bufp, "align", 5) == 0 || strncmp ((char *) *bufp, "multline", 8) == 0 || strncmp ((char *) *bufp, "flalign", 7) == 0 || strncmp ((char *) *bufp, "alignat", 7) == 0 #ifdef IGNOREBIB || strncmp ((char *) *bufp, "thebibliography", 15) == 0 #endif || strncmp ((char *) *bufp, "verbatim", 8) == 0 || strncmp ((char *) *bufp, "math", 4) == 0) { (*bufp)--; TeX_skip_parens (bufp); return 1; } if (cont_char == 'b') TeX_skip_args (bufp); else TeX_skip_parens (bufp); return 0; } static void TeX_skip_parens (bufp) unsigned char ** bufp; { while (**bufp && **bufp != TEXRIGHTCURLY && **bufp != TEXDOLLAR) (*bufp)++; } static void TeX_open_paren (bufp) unsigned char ** bufp; { while (**bufp && **bufp != TEXLEFTCURLY && **bufp != TEXDOLLAR) (*bufp)++; } static void TeX_skip_check (bufp) unsigned char ** bufp; { unsigned char * endp; int skip_ch; for (endp = *bufp; isalpha(*endp) || *endp == '@'; endp++) ; if (keyword_in_list (*bufp, endp, &texskip1list)) { TeX_skip_parens (bufp); if (**bufp == 0) LaTeX_Mode = 'r'; } else if (keyword_in_list (*bufp, endp, &texskip2list)) /* skip two args. */ { TeX_skip_parens (bufp); if (**bufp == 0) /* Only skips one {} if not on same line. */ LaTeX_Mode = 'r'; else /* Skip second arg. */ { (*bufp)++; TeX_skip_parens (bufp); if (**bufp == 0) LaTeX_Mode = 'r'; } } else if (TeX_strncmp (*bufp, "verb", 4) == 0) { skip_ch = (*bufp)[4]; *bufp += 5; while (**bufp != skip_ch && **bufp != '\0') (*bufp)++; } else { /* Optional tex arguments sometimes should and ** sometimes shouldn't be checked ** (eg \section [C Programming] {foo} vs ** \rule [3em] {0.015in} {5em}) ** SO -- we'll always spell-check it rather than make a ** full LaTeX parser. */ /* Must look at the space after the command. */ while (isalpha(**bufp) || **bufp == '@') (*bufp)++; /* ** Our caller expects to skip over a single character. So we ** need to back up by one. Ugh. */ (*bufp)--; } } /* * TeX_strncmp is like strncmp, except that it returns inequality if * the following character of a is alphabetic. We do not use * iswordch here because TeX itself won't normally accept * nonalphabetics (except maybe on ISO Latin-1 installations? I'll * have to look into that). As a special hack, because LaTeX uses the * @ sign so much, we'll also accept that character. * * Properly speaking, the @ sign should be settable in the hash file * header, but I doubt that it varies, and I don't want to change the * syntax of affix files right now. * * Incidentally, TeX_strncmp uses uneual signedness for its arguments * because that's how it's always called, and it's easier to do one * typecast here than lots of casts in the calls. */ static int TeX_strncmp (a, b, n) unsigned char * a; /* Strings to compare */ char * b; /* ... */ int n; /* Number of characters to compare */ { int cmpresult; /* Result of calling strncmp */ cmpresult = strncmp ((char *) a, b, n); if (cmpresult == 0) { if (isascii (a[n]) && isalpha (a[n])) return 1; /* Force inequality if alpha follows */ } return cmpresult; } /* * Set up a table of keywords to be treated specially. The input is * "rawtags", which is a comma-separated list of keywords to be * inserted into the table. If rawtags is null, the environment * variable "envvar" is consulted; if this isn't set, the list in * "deftags" is used instead. * * If either rawtags or the contents of envvar begins with a + sign, * then this value is appended to the string in deftags, rather than * replacing it. This effect is cumulative. Thus, there are the * following possibilities: * * rawtags envvar deftags result * xxx * * xxx * +xxx yyy * xxx,yyy * +xxx +yyy zzz xxx,yyy,zzz * null yyy * yyy * null +yyy zzz yyy,zzz * null null zzz zzz * * The result of this routine is the initialization of a "struct * kwtable" that can be passed to keyword_in_list for lookup purposes. * Before the first time this routine is called, the "kwlist" pointer * in the struct kwtable must be set to NULL, to indicate that the * structure is uninitialized. * * Returns nonzero if the list has already been initialized, zero if * all is well. */ int init_keyword_table (rawtags, envvar, deftags, ignorecase, keywords) char * rawtags; /* Comma-separated list of tags to look up */ char * envvar; /* Environment variable containing tag list */ char * deftags; /* Default tag list */ int ignorecase; /* NZ to ignore case in keyword matching */ struct kwtable * keywords; /* Where to put the keyword table */ { char * end; /* End of current tag */ char * envtags; /* Tags from envvar */ unsigned char ** nextkw; /* Next keyword-table entry */ char * start; /* Start of current tag */ char * wlist; /* Modifiable copy of raw list */ unsigned int wsize; /* Size of wlist */ if (keywords->kwlist != NULL) return 1; envtags = (envvar == NULL) ? NULL : getenv (envvar); if (rawtags != NULL && rawtags[0] != '+') { envtags = NULL; deftags = NULL; } if (envtags != NULL && envtags[0] != '+') deftags = NULL; /* * Allocate space for the modifiable tag list. This may * over-allocate by up to 2 bytes if the "+" notation is used. */ wsize = 0; if (rawtags != NULL) wsize += strlen (rawtags) + 1; if (envtags != NULL) wsize += strlen (envtags) + 1; if (deftags != NULL) wsize += strlen (deftags) + 1; wlist = malloc (wsize); if (wlist == NULL) { (void) fprintf (stderr, DEFMT_C_NO_SPACE); exit (1); } wlist[0] = '\0'; if (rawtags != NULL) { if (rawtags[0] == '+') rawtags++; strcpy (wlist, rawtags); } if (envtags != NULL) { if (envtags[0] == '+') envtags++; if (wlist[0] != '\0') strcat (wlist, ","); strcat (wlist, envtags); } if (deftags != NULL) { if (wlist[0] != '\0') strcat (wlist, ","); strcat (wlist, deftags); } /* * Count the keywords and allocate space for the pointers. */ keywords->numkw = 1; keywords->forceupper = ignorecase; for (end = wlist; *end != '\0'; ++end) { if (*end == ',' || *end == ':') ++keywords->numkw; } keywords->kwlist = (unsigned char **) malloc (keywords->numkw * sizeof keywords->kwlist[0]); if (keywords->kwlist == NULL) { fprintf (stderr, DEFMT_C_NO_SPACE); exit (1); } end = wlist; nextkw = keywords->kwlist; keywords->maxlen = 0; keywords->minlen = 0; while (nextkw < keywords->kwlist + keywords->numkw) { for (start = end; *end != '\0' && *end != ',' && *end != ':'; end++) ; *end = '\0'; if (ignorecase) chupcase ((unsigned char *) start); if (end == start) --keywords->numkw; else { *nextkw++ = (unsigned char *) start; if ((unsigned) (end - start) > keywords->maxlen) keywords->maxlen = end - start; if (keywords->minlen == 0 || (unsigned) (end - start) < keywords->minlen) keywords->minlen = end - start; } end++; } qsort ((char *) keywords->kwlist, keywords->numkw, sizeof keywords->kwlist[0], (int (*) P ((const void *, const void *))) tagcmp); if (keywords->maxlen > maxkeywordlen) { maxkeywordlen = keywords->maxlen; if (keywordbuf != NULL) free (keywordbuf); keywordbuf = (unsigned char *) malloc ((maxkeywordlen + 1) * sizeof keywordbuf[0]); if (keywordbuf == NULL) { fprintf (stderr, DEFMT_C_NO_SPACE); exit(1); } } return 0; } /* * Decide whether a given keyword is in a list of those that need * special treatment. The list of tags is so small that there's * little point in being fancy, but the code given to me used a binary * search and it seemed silly to throw it away. * * Returns nonzero if the keyword is in the chosen list. */ static int keyword_in_list (str, strend, keywords) unsigned char * str; /* String to be tested (not null-terminated) */ unsigned char * strend; /* Character following end of test string */ struct kwtable * keywords; /* Table of keywords to be searched */ { int cmpresult; /* Result of string comparison */ unsigned int i; /* Current binary-search position */ int imin; /* Bottom of binary search */ int imax; /* Top of binary search */ i = strend - str; if (i < keywords->minlen || i > keywords->maxlen) return 0; strncpy ((char *) keywordbuf, (char *) str, i); keywordbuf[i] = '\0'; if (keywords->forceupper) chupcase (keywordbuf); /* * Binary search through the tags list */ imin = 0; imax = keywords->numkw - 1; while (imin <= imax) { i = (imin + imax) >> 1; cmpresult = strcmp ((char *) keywordbuf, (char *) keywords->kwlist[i]); if (cmpresult == 0) return 1; else if (cmpresult > 0 ) imin = i + 1; else imax = i - 1; } return 0; /* Not a tag to be ignored */ } /* * Compare two pointed-to strings */ static int tagcmp (a, b) unsigned char ** a; /* Strings to be compared */ unsigned char ** b; /* ... */ { return strcmp ((char *) *a, (char *) *b); }