Cleanup etags.c to use locale-independent code

author Paul Eggert <eggert@cs.ucla.edu>

Sat, 23 May 2015 22:18:12 +0000 (15:18 -0700)

committer Paul Eggert <eggert@cs.ucla.edu>

Sat, 23 May 2015 22:19:02 +0000 (15:19 -0700)
author Paul Eggert <eggert@cs.ucla.edu>
Sat, 23 May 2015 22:18:12 +0000 (15:18 -0700)
committer Paul Eggert <eggert@cs.ucla.edu>
Sat, 23 May 2015 22:19:02 +0000 (15:19 -0700)
diff --git a/lib-src/etags.c b/lib-src/etags.c

index 28729da87200a86aeb3e040984909c0f642b90d7..facb462f67b8c906244a1627422939434f1e69f1 100644 (file)
--- a/lib-src/etags.c
+++ b/lib-src/etags.c
@@ -119,17 +119,18 @@ char pot_etags_version[] = "@(#) pot revision number is 17.38.1.4";
  # define O_CLOEXEC O_NOINHERIT
  #endif /* WINDOWSNT */
  
+#include <limits.h>
  #include <unistd.h>
  #include <stdarg.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sysstdio.h>
-#include <ctype.h>
  #include <errno.h>
  #include <fcntl.h>
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <binary-io.h>
+#include <c-ctype.h>
  #include <c-strcase.h>
  
  #include <assert.h>
@@ -156,21 +157,71 @@ char pot_etags_version[] = "@(#) pot revision number is 17.38.1.4";
  #define strneq(s,t,n)  (assert ((s)!=NULL || (t)!=NULL), !strncmp (s, t, n))
  #define strncaseeq(s,t,n) (assert ((s)!=NULL && (t)!=NULL), !c_strncasecmp (s, t, n))
  
-#define CHARS 256              /* 2^sizeof(char) */
-#define CHAR(x)                ((unsigned int)(x) & (CHARS - 1))
-#define        iswhite(c)      (_wht[CHAR (c)]) /* c is white (see white) */
-#define notinname(c)   (_nin[CHAR (c)]) /* c is not in a name (see nonam) */
-#define        begtoken(c)     (_btk[CHAR (c)]) /* c can start token (see begtk) */
-#define        intoken(c)      (_itk[CHAR (c)]) /* c can be in token (see midtk) */
-#define        endtoken(c)     (_etk[CHAR (c)]) /* c ends tokens (see endtk) */
+/* C is not in a name.  */
+static bool
+notinname (unsigned char c)
+{
+  /* Look at make_tag before modifying!  */
+  static bool const table[UCHAR_MAX + 1] = {
+    ['\0']=1, ['\t']=1, ['\n']=1, ['\f']=1, ['\r']=1, [' ']=1,
+    ['(']=1, [')']=1, [',']=1, [';']=1, ['=']=1
+  };
+  return table[c];
+}
  
-#define ISALNUM(c)     isalnum (CHAR (c))
-#define ISALPHA(c)     isalpha (CHAR (c))
-#define ISDIGIT(c)     isdigit (CHAR (c))
-#define ISLOWER(c)     islower (CHAR (c))
+/* C can start a token.  */
+static bool
+begtoken (unsigned char c)
+{
+  static bool const table[UCHAR_MAX + 1] = {
+    ['$']=1, ['@']=1,
+    ['A']=1, ['B']=1, ['C']=1, ['D']=1, ['E']=1, ['F']=1, ['G']=1, ['H']=1,
+    ['I']=1, ['J']=1, ['K']=1, ['L']=1, ['M']=1, ['N']=1, ['O']=1, ['P']=1,
+    ['Q']=1, ['R']=1, ['S']=1, ['T']=1, ['U']=1, ['V']=1, ['W']=1, ['X']=1,
+    ['Y']=1, ['Z']=1,
+    ['_']=1,
+    ['a']=1, ['b']=1, ['c']=1, ['d']=1, ['e']=1, ['f']=1, ['g']=1, ['h']=1,
+    ['i']=1, ['j']=1, ['k']=1, ['l']=1, ['m']=1, ['n']=1, ['o']=1, ['p']=1,
+    ['q']=1, ['r']=1, ['s']=1, ['t']=1, ['u']=1, ['v']=1, ['w']=1, ['x']=1,
+    ['y']=1, ['z']=1,
+    ['~']=1
+  };
+  return table[c];
+}
  
-#define lowcase(c)     tolower (CHAR (c))
+/* C can be in the middle of a token.  */
+static bool
+intoken (unsigned char c)
+{
+  static bool const table[UCHAR_MAX + 1] = {
+    ['0']=1, ['1']=1, ['2']=1, ['3']=1, ['4']=1,
+    ['5']=1, ['6']=1, ['7']=1, ['8']=1, ['9']=1,
+    ['A']=1, ['B']=1, ['C']=1, ['D']=1, ['E']=1, ['F']=1, ['G']=1, ['H']=1,
+    ['I']=1, ['J']=1, ['K']=1, ['L']=1, ['M']=1, ['N']=1, ['O']=1, ['P']=1,
+    ['Q']=1, ['R']=1, ['S']=1, ['T']=1, ['U']=1, ['V']=1, ['W']=1, ['X']=1,
+    ['Y']=1, ['Z']=1,
+    ['_']=1,
+    ['a']=1, ['b']=1, ['c']=1, ['d']=1, ['e']=1, ['f']=1, ['g']=1, ['h']=1,
+    ['i']=1, ['j']=1, ['k']=1, ['l']=1, ['m']=1, ['n']=1, ['o']=1, ['p']=1,
+    ['q']=1, ['r']=1, ['s']=1, ['t']=1, ['u']=1, ['v']=1, ['w']=1, ['x']=1,
+    ['y']=1, ['z']=1
+  };
+  return table[c];
+}
  
+/* C can end a token.  */
+static bool
+endtoken (unsigned char c)
+{
+  static bool const table[UCHAR_MAX + 1] = {
+    ['\0']=1, ['\t']=1, ['\n']=1, ['\r']=1, [' ']=1,
+    ['!']=1, ['"']=1, ['#']=1, ['%']=1, ['&']=1, ['\'']=1, ['(']=1, [')']=1,
+    ['*']=1, ['+']=1, [',']=1, ['-']=1, ['.']=1, ['/']=1, [':']=1, [';']=1,
+    ['<']=1, ['=']=1, ['>']=1, ['?']=1, ['[']=1, [']']=1, ['^']=1,
+    ['{']=1, ['|']=1, ['}']=1, ['~']=1
+  };
+  return table[c];
+}
  
  /*
   *     xnew, xrnew -- allocate, reallocate storage
@@ -316,7 +367,6 @@ _Noreturn void fatal (const char *, const char *);
  static _Noreturn void pfatal (const char *);
  static void add_node (node *, node **);
  
-static void init (void);
  static void process_file_name (char *, language *);
  static void process_file (FILE *, char *, language *);
  static void find_entries (FILE *);
@@ -370,20 +420,6 @@ static linebuffer lb;              /* the current line */
  static linebuffer filebuf;     /* a buffer containing the whole file */
  static linebuffer token_name;  /* a buffer containing a tag name */
  
-/* boolean "functions" (see init)      */
-static bool _wht[CHARS], _nin[CHARS], _itk[CHARS], _btk[CHARS], _etk[CHARS];
-static const char
-  /* white chars */
-  *white = " \f\t\n\r\v",
-  /* not in a name */
-  *nonam = " \f\t\n\r()=,;",   /* look at make_tag before modifying! */
-  /* token ending chars */
-  *endtk = " \t\n\r\"'#()[]{}=-+%*/&|^~!<>;,.:?",
-  /* token starting chars */
-  *begtk = "ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz$~@",
-  /* valid in-token chars */
-  *midtk = "ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz$0123456789";
-
  static bool append_to_tagfile; /* -a: append to tags */
  /* The next five default to true in C and derived languages.  */
  static bool typedefs;          /* -t: create tags for C and Ada typedefs */
@@ -1167,8 +1203,6 @@ main (int argc, char **argv)
        tagfiledir = absolute_dirname (tagfile, cwd);
      }
  
-  init ();                     /* set up boolean "functions" */
-
    linebuffer_init (&lb);
    linebuffer_init (&filename_lb);
    linebuffer_init (&filebuf);
@@ -1636,34 +1670,6 @@ process_file (FILE *fh, char *fn, language *lang)
      }
  }
  
-/*
- * This routine sets up the boolean pseudo-functions which work
- * by setting boolean flags dependent upon the corresponding character.
- * Every char which is NOT in that string is not a white char.  Therefore,
- * all of the array "_wht" is set to false, and then the elements
- * subscripted by the chars in "white" are set to true.  Thus "_wht"
- * of a char is true if it is the string "white", else false.
- */
-static void
-init (void)
-{
-  const char *sp;
-  int i;
-
-  for (i = 0; i < CHARS; i++)
-    iswhite (i) = notinname (i) = begtoken (i) = intoken (i) = endtoken (i)
-      = false;
-  for (sp = white; *sp != '\0'; sp++) iswhite (*sp) = true;
-  for (sp = nonam; *sp != '\0'; sp++) notinname (*sp) = true;
-  notinname ('\0') = notinname ('\n');
-  for (sp = begtk; *sp != '\0'; sp++) begtoken (*sp) = true;
-  begtoken ('\0') = begtoken ('\n');
-  for (sp = midtk; *sp != '\0'; sp++) intoken (*sp) = true;
-  intoken ('\0') = intoken ('\n');
-  for (sp = endtk; *sp != '\0'; sp++) endtoken (*sp) = true;
-  endtoken ('\0') = endtoken ('\n');
-}
-
  /*
   * This routine opens the specified file and calls the function
   * which finds the function and type definitions.
@@ -3179,7 +3185,7 @@ C_entries (int c_ext, FILE *inf)
                  followed by an end of comment, this is a preprocessor
                  token. */
               for (cp = newlb.buffer; cp < lp-1; cp++)
-               if (!iswhite (*cp))
+               if (!c_isspace (*cp))
                   {
                     if (*cp == '*' && cp[1] == '/')
                       {
@@ -3262,7 +3268,7 @@ C_entries (int c_ext, FILE *inf)
                               if (*lp != '\0')
                                 lp += 1;
                               while (*lp != '\0'
-                                    && !iswhite (*lp) && *lp != '(')
+                                    && !c_isspace (*lp) && *lp != '(')
                                 lp += 1;
                               c = *lp++;
                               toklen += lp - oldlp;
@@ -3934,14 +3940,14 @@ F_takeprec (void)
        dbp += 3;
        return;
      }
-  if (!ISDIGIT (*dbp))
+  if (!c_isdigit (*dbp))
      {
        --dbp;                   /* force failure */
        return;
      }
    do
      dbp++;
-  while (ISDIGIT (*dbp));
+  while (c_isdigit (*dbp));
  }
  
  static void
@@ -3959,7 +3965,7 @@ F_getit (FILE *inf)
        dbp += 6;
        dbp = skip_spaces (dbp);
      }
-  if (!ISALPHA (*dbp) && *dbp != '_' && *dbp != '$')
+  if (!c_isalpha (*dbp) && *dbp != '_' && *dbp != '$')
      return;
    for (cp = dbp + 1; *cp != '\0' && intoken (*cp); cp++)
      continue;
@@ -3988,7 +3994,7 @@ Fortran_functions (FILE *inf)
        if (LOOKING_AT_NOCASE (dbp, "elemental"))
         dbp = skip_spaces (dbp);
  
-      switch (lowcase (*dbp))
+      switch (c_tolower (*dbp))
         {
         case 'i':
           if (nocase_tail ("integer"))
@@ -4021,7 +4027,7 @@ Fortran_functions (FILE *inf)
        dbp = skip_spaces (dbp);
        if (*dbp == '\0')
         continue;
-      switch (lowcase (*dbp))
+      switch (c_tolower (*dbp))
         {
         case 'f':
           if (nocase_tail ("function"))
@@ -4075,7 +4081,7 @@ Ada_getit (FILE *inf, const char *name_qualifier)
           readline (&lb, inf);
           dbp = lb.buffer;
         }
-      switch (lowcase (*dbp))
+      switch (c_tolower (*dbp))
          {
          case 'b':
            if (nocase_tail ("body"))
@@ -4102,8 +4108,7 @@ Ada_getit (FILE *inf, const char *name_qualifier)
         {
           dbp = skip_spaces (dbp);
           for (cp = dbp;
-              (*cp != '\0'
-               && (ISALPHA (*cp) || ISDIGIT (*cp) || *cp == '_' || *cp == '.'));
+              c_isalnum (*cp) || *cp == '_' || *cp == '.';
                cp++)
             continue;
           if (cp == dbp)
@@ -4179,7 +4184,7 @@ Ada_funcs (FILE *inf)
             }
  
           /* We are at the beginning of a token. */
-         switch (lowcase (*dbp))
+         switch (c_tolower (*dbp))
             {
             case 'f':
               if (!packages_only && nocase_tail ("function"))
@@ -4244,13 +4249,13 @@ Asm_labels (FILE *inf)
      {
        /* If first char is alphabetic or one of [_.$], test for colon
          following identifier. */
-      if (ISALPHA (*cp) || *cp == '_' || *cp == '.' || *cp == '$')
+      if (c_isalpha (*cp) || *cp == '_' || *cp == '.' || *cp == '$')
         {
           /* Read past label. */
           cp++;
-         while (ISALNUM (*cp) || *cp == '_' || *cp == '.' || *cp == '$')
+         while (c_isalnum (*cp) || *cp == '_' || *cp == '.' || *cp == '$')
             cp++;
-         if (*cp == ':' || iswhite (*cp))
+         if (*cp == ':' || c_isspace (*cp))
             /* Found end of label, so copy it and add it to the table. */
             make_tag (lb.buffer, cp - lb.buffer, true,
                       lb.buffer, cp - lb.buffer + 1, lineno, linecharno);
@@ -4338,7 +4343,7 @@ Perl_functions (FILE *inf)
               varstart += 1;
               do
                 cp++;
-             while (ISALNUM (*cp) || *cp == '_');
+             while (c_isalnum (*cp) || *cp == '_');
             }
           else if (qual)
             {
@@ -4433,7 +4438,7 @@ PHP_functions (FILE *inf)
           if (*cp != '\0')
             {
               name = cp;
-             while (*cp != '\0' && !iswhite (*cp))
+             while (*cp != '\0' && !c_isspace (*cp))
                 cp++;
               make_tag (name, cp - name, false,
                         lb.buffer, cp - lb.buffer + 1, lineno, linecharno);
@@ -4485,10 +4490,10 @@ Cobol_paragraphs (FILE *inf)
        bp += 8;
  
        /* If eoln, compiler option or comment ignore whole line. */
-      if (bp[-1] != ' ' || !ISALNUM (bp[0]))
+      if (bp[-1] != ' ' || !c_isalnum (bp[0]))
          continue;
  
-      for (ep = bp; ISALNUM (*ep) || *ep == '-'; ep++)
+      for (ep = bp; c_isalnum (*ep) || *ep == '-'; ep++)
         continue;
        if (*ep++ == '.')
         make_tag (bp, ep - bp, true,
@@ -4636,7 +4641,7 @@ Pascal_functions (FILE *inf)
           /* Check if this is an "extern" declaration. */
           if (*dbp == '\0')
             continue;
-         if (lowcase (*dbp) == 'e')
+         if (c_tolower (*dbp) == 'e')
             {
               if (nocase_tail ("extern")) /* superfluous, really! */
                 {
@@ -4644,7 +4649,7 @@ Pascal_functions (FILE *inf)
                   verify_tag = false;
                 }
             }
-         else if (lowcase (*dbp) == 'f')
+         else if (c_tolower (*dbp) == 'f')
             {
               if (nocase_tail ("forward")) /* check for forward reference */
                 {
@@ -4691,7 +4696,7 @@ Pascal_functions (FILE *inf)
        else if (!incomment && !inquote && !found_tag)
         {
           /* Check for proc/fn keywords. */
-         switch (lowcase (c))
+         switch (c_tolower (c))
             {
             case 'p':
               if (nocase_tail ("rocedure")) /* c = 'p', dbp has advanced */
@@ -4855,13 +4860,13 @@ Forth_words (FILE *inf)
  
    LOOP_ON_INPUT_LINES (inf, lb, bp)
      while ((bp = skip_spaces (bp))[0] != '\0')
-      if (bp[0] == '\\' && iswhite (bp[1]))
+      if (bp[0] == '\\' && c_isspace (bp[1]))
         break;                  /* read next line */
-      else if (bp[0] == '(' && iswhite (bp[1]))
+      else if (bp[0] == '(' && c_isspace (bp[1]))
         do                      /* skip to ) or eol */
           bp++;
         while (*bp != ')' && *bp != '\0');
-      else if ((bp[0] == ':' && iswhite (bp[1]) && bp++)
+      else if ((bp[0] == ':' && c_isspace (bp[1]) && bp++)
                || LOOKING_AT_NOCASE (bp, "constant")
                || LOOKING_AT_NOCASE (bp, "code")
                || LOOKING_AT_NOCASE (bp, "create")
@@ -4972,7 +4977,7 @@ TeX_commands (FILE *inf)
                     cp++;
                   }
                 for (p = cp;
-                    (!iswhite (*p) && *p != '#' &&
+                    (!c_isspace (*p) && *p != '#' &&
                       *p != TEX_opgrp && *p != TEX_clgrp);
                      p++)
                   continue;
@@ -5130,7 +5135,7 @@ HTML_labels (FILE *inf)
         else if (intag) /* look for "name=" or "id=" */
           {
             while (*dbp != '\0' && *dbp != '>'
-                  && lowcase (*dbp) != 'n' && lowcase (*dbp) != 'i')
+                  && c_tolower (*dbp) != 'n' && c_tolower (*dbp) != 'i')
               dbp++;
             if (*dbp == '\0')
               break;            /* go to next line */
@@ -5172,7 +5177,7 @@ HTML_labels (FILE *inf)
             if (*dbp == '<')
               {
                 intag = true;
-               inanchor = (lowcase (dbp[1]) == 'a' && !intoken (dbp[2]));
+               inanchor = (c_tolower (dbp[1]) == 'a' && !intoken (dbp[2]));
                 continue;       /* look on the same line */
               }
  
@@ -5192,7 +5197,7 @@ HTML_labels (FILE *inf)
             if (*dbp == '\0')
               break;            /* go to next line */
             intag = true;
-           if (lowcase (dbp[1]) == 'a' && !intoken (dbp[2]))
+           if (c_tolower (dbp[1]) == 'a' && !intoken (dbp[2]))
               {
                 inanchor = true;
                 continue;       /* look on the same line */
@@ -5239,7 +5244,7 @@ Prolog_functions (FILE *inf)
      {
        if (cp[0] == '\0')       /* Empty line */
         continue;
-      else if (iswhite (cp[0])) /* Not a predicate */
+      else if (c_isspace (cp[0])) /* Not a predicate */
         continue;
        else if (cp[0] == '/' && cp[1] == '*')   /* comment. */
         prolog_skip_comment (&lb, inf);
@@ -5331,11 +5336,11 @@ prolog_atom (char *s, size_t pos)
  
    origpos = pos;
  
-  if (ISLOWER (s[pos]) || (s[pos] == '_'))
+  if (c_islower (s[pos]) || s[pos] == '_')
      {
        /* The atom is unquoted. */
        pos++;
-      while (ISALNUM (s[pos]) || (s[pos] == '_'))
+      while (c_isalnum (s[pos]) || s[pos] == '_')
         {
           pos++;
         }
@@ -5399,7 +5404,7 @@ Erlang_functions (FILE *inf)
      {
        if (cp[0] == '\0')       /* Empty line */
         continue;
-      else if (iswhite (cp[0])) /* Not function nor attribute */
+      else if (c_isspace (cp[0])) /* Not function nor attribute */
         continue;
        else if (cp[0] == '%')   /* comment */
         continue;
@@ -5506,12 +5511,12 @@ erlang_atom (char *s)
  {
    int pos = 0;
  
-  if (ISALPHA (s[pos]) || s[pos] == '_')
+  if (c_isalpha (s[pos]) || s[pos] == '_')
      {
        /* The atom is unquoted. */
        do
         pos++;
-      while (ISALNUM (s[pos]) || s[pos] == '_');
+      while (c_isalnum (s[pos]) || s[pos] == '_');
      }
    else if (s[pos] == '\'')
      {
@@ -5729,10 +5734,10 @@ add_regex (char *regexp_pattern, language *lang)
    *patbuf = zeropattern;
    if (ignore_case)
      {
-      static char lc_trans[CHARS];
+      static char lc_trans[UCHAR_MAX + 1];
        int i;
-      for (i = 0; i < CHARS; i++)
-       lc_trans[i] = lowcase (i);
+      for (i = 0; i < UCHAR_MAX + 1; i++)
+       lc_trans[i] = c_tolower (i);
        patbuf->translate = lc_trans;    /* translation table to fold case  */
      }
  
@@ -5787,7 +5792,7 @@ substitute (char *in, char *out, struct re_registers *regs)
    for (t = strchr (out, '\\');
         t != NULL;
         t = strchr (t + 2, '\\'))
-    if (ISDIGIT (t[1]))
+    if (c_isdigit (t[1]))
        {
         dig = t[1] - '0';
         diglen = regs->end[dig] - regs->start[dig];
@@ -5801,7 +5806,7 @@ substitute (char *in, char *out, struct re_registers *regs)
    result = xnew (size + 1, char);
  
    for (t = result; *out != '\0'; out++)
-    if (*out == '\\' && ISDIGIT (*++out))
+    if (*out == '\\' && c_isdigit (*++out))
        {
         dig = *out - '0';
         diglen = regs->end[dig] - regs->start[dig];
@@ -5921,9 +5926,9 @@ regex_tag_multiline (void)
  static bool
  nocase_tail (const char *cp)
  {
-  register int len = 0;
+  int len = 0;
  
-  while (*cp != '\0' && lowcase (*cp) == lowcase (dbp[len]))
+  while (*cp != '\0' && c_tolower (*cp) == c_tolower (dbp[len]))
      cp++, len++;
    if (*cp == '\0' && !intoken (dbp[len]))
      {
@@ -6257,7 +6262,7 @@ savenstr (const char *cp, int len)
  static char *
  skip_spaces (char *cp)
  {
-  while (iswhite (*cp))
+  while (c_isspace (*cp))
      cp++;
    return cp;
  }
@@ -6266,7 +6271,7 @@ skip_spaces (char *cp)
  static char *
  skip_non_spaces (char *cp)
  {
-  while (*cp != '\0' && !iswhite (*cp))
+  while (*cp != '\0' && !c_isspace (*cp))
      cp++;
    return cp;
  }
@@ -6527,7 +6532,7 @@ filename_is_absolute (char *fn)
  {
    return (fn[0] == '/'
  #ifdef DOS_NT
-         || (ISALPHA (fn[0]) && fn[1] == ':' && fn[2] == '/')
+         || (c_isalpha (fn[0]) && fn[1] == ':' && fn[2] == '/')
  #endif
           );
  }
@@ -6541,9 +6546,8 @@ canonicalize_filename (register char *fn)
  
  #ifdef DOS_NT
    /* Canonicalize drive letter case.  */
-# define ISUPPER(c)    isupper (CHAR (c))
-  if (fn[0] != '\0' && fn[1] == ':' && ISUPPER (fn[0]))
-    fn[0] = lowcase (fn[0]);
+  if (c_isupper (fn[0]) && fn[1] == ':')
+    fn[0] = c_tolower (fn[0]);
  
    /* Collapse multiple forward- and back-slashes into a single forward
       slash. */
author	Paul Eggert <eggert@cs.ucla.edu>
	Sat, 23 May 2015 22:18:12 +0000 (15:18 -0700)
committer	Paul Eggert <eggert@cs.ucla.edu>
	Sat, 23 May 2015 22:19:02 +0000 (15:19 -0700)