From 0168c3d80919476f43fcf4f4ae64fc18bf6365a7 Mon Sep 17 00:00:00 2001 From: Kenichi Handa Date: Fri, 1 Mar 2002 01:15:38 +0000 Subject: [PATCH] New file. --- src/character.c | 917 ++++++++++++++++++++++++++++++++++++++++++++++++ src/character.h | 530 ++++++++++++++++++++++++++++ 2 files changed, 1447 insertions(+) create mode 100644 src/character.c create mode 100644 src/character.h diff --git a/src/character.c b/src/character.c new file mode 100644 index 00000000000..51527ee7dbb --- /dev/null +++ b/src/character.c @@ -0,0 +1,917 @@ +/* Basic character support. + Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN. + Licensed to the Free Software Foundation. + Copyright (C) 2001 Free Software Foundation, Inc. + Copyright (C) 2001, 2002 + National Institute of Advanced Industrial Science and Technology (AIST) + Registration Number H13PRO009 + +This file is part of GNU Emacs. + +GNU Emacs is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Emacs is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Emacs; see the file COPYING. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +/* At first, see the document in `character.h' to understand the code + in this file. 
*/ + +/* NOTE(review): the bare #include directives below carry no header name; the names appear to have been lost when this text was extracted -- confirm against the original file. */ #ifdef emacs +#include +#endif + +#include + +#ifdef emacs + +#include +#include "lisp.h" +#include "character.h" +#include "buffer.h" +#include "charset.h" +#include "composite.h" +#include "disptab.h" + +#else /* not emacs */ + +#include "mulelib.h" + +#endif /* emacs */ + +/* The symbol `characterp'; set up by DEFSYM in syms_of_character and used by CHECK_CHARACTER when signalling a wrong-type error. */ Lisp_Object Qcharacterp; + +/* Vector of all translation tables ever defined. + The ID of a translation table is used to index this vector. */ +Lisp_Object Vtranslation_table_vector; + +/* A char-table for characters which may invoke auto-filling. */ +Lisp_Object Vauto_fill_chars; + +/* The symbol `auto-fill-chars'; set up by DEFSYM in syms_of_character and used as the purpose of Vauto_fill_chars. */ Lisp_Object Qauto_fill_chars; + +/* Consulted by MAYBE_UNIFY_CHAR when it holds a char-table; initialized to nil in syms_of_character. */ Lisp_Object Vchar_unify_table; + +/* A char-table. An element is non-nil iff the corresponding + character has a printable glyph. */ +Lisp_Object Vprintable_chars; + +/* A char-table. An element is the column-width of the corresponding + character. */ +Lisp_Object Vchar_width_table; + +/* A char-table. An element is a symbol indicating the direction + property of corresponding character. NOTE(review): syms_of_character fills this table with the integer 1 and the doc of `char-direction' speaks of 0 and 1, not symbols -- confirm which is intended. */ +Lisp_Object Vchar_direction_table; + +/* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. 
*/ +unsigned char *_fetch_multibyte_char_p; +int _fetch_multibyte_char_len; + + + +int +char_string_with_unification (c, p, advanced) + int c; + unsigned char *p, **advanced; +{ + int bytes; + + MAYBE_UNIFY_CHAR (c); + + if (c <= MAX_3_BYTE_CHAR || c > MAX_5_BYTE_CHAR) + { + bytes = CHAR_STRING (c, p); + } + else if (c <= MAX_4_BYTE_CHAR) + { + p[0] = (0xF0 | (c >> 18)); + p[1] = (0x80 | ((c >> 12) & 0x3F)); + p[2] = (0x80 | ((c >> 6) & 0x3F)); + p[3] = (0x80 | (c & 0x3F)); + bytes = 4; + } + else + { + p[0] = 0xF8; + p[1] = (0x80 | ((c >> 18) & 0x0F)); + p[2] = (0x80 | ((c >> 12) & 0x3F)); + p[3] = (0x80 | ((c >> 6) & 0x3F)); + p[4] = (0x80 | (c & 0x3F)); + bytes = 5; + } + if (advanced) + *advanced = p + bytes; + return bytes; +} + + +int +string_char_with_unification (p, advanced, len) + unsigned char *p, **advanced; + int *len; +{ + int c, unified; + unsigned char *saved_p = p; + + if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10)) + { + c = STRING_CHAR_ADVANCE (p); + } + else if (! (*p & 0x08)) + { + c = ((((p)[0] & 0xF) << 18) + | (((p)[1] & 0x3F) << 12) + | (((p)[2] & 0x3F) << 6) + | ((p)[3] & 0x3F)); + p += 4; + } + else + { + c = ((((p)[1] & 0x3F) << 18) + | (((p)[2] & 0x3F) << 12) + | (((p)[3] & 0x3F) << 6) + | ((p)[4] & 0x3F)); + p += 5; + } + + MAYBE_UNIFY_CHAR (c); + + if (len) + *len = p - saved_p; + if (advanced) + *advanced = p; + return c; +} + + +/* Translate character C by translation table TABLE. If C is + negative, translate a character specified by CHARSET and CODE. If + no translation is found in TABLE, return the untranslated + character. */ + +int +translate_char (table, c) + Lisp_Object table; + int c; +{ + Lisp_Object ch; + + if (! CHAR_TABLE_P (table)) + return c; + ch = CHAR_TABLE_REF (table, c); + if (! CHARACTERP (ch)) + return c; + return XINT (ch); +} + +/* Convert the unibyte character C to the corresponding multibyte + character based on the current value of charset_primary. If C + can't be converted, return C. 
*/ + +int +unibyte_char_to_multibyte (c) + int c; +{ + struct charset *charset = CHARSET_FROM_ID (charset_primary); + int c1 = DECODE_CHAR (charset, c); + + return ((c1 >= 0) ? c1 : c); +} + + +/* Convert the multibyte character C to unibyte 8-bit character based + on the current value of charset_primary. If dimension of + charset_primary is more than one, return (C & 0xFF). + + The argument REV_TBL is now ignored. It will be removed in the + future. */ + +int +multibyte_char_to_unibyte (c, rev_tbl) + int c; + Lisp_Object rev_tbl; +{ + struct charset *charset = CHARSET_FROM_ID (charset_primary); + unsigned c1 = ENCODE_CHAR (charset, c); + + return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF); +} + + +DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0, + doc: /* Return non-nil if OBJECT is a character. */) + (object, ignore) + Lisp_Object object, ignore; +{ + return (CHARACTERP (object) ? Qt : Qnil); +} + +DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0, + doc: /* Return the character of the maximum code. */) + () +{ + return make_number (MAX_CHAR); +} + +DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte, + Sunibyte_char_to_multibyte, 1, 1, 0, + doc: /* Convert the unibyte character CH to multibyte character. +The multibyte character is a result of decoding CH by +the current primary charset (value of `charset-primary'). 
*/) + (ch) + Lisp_Object ch; +{ + int c; + struct charset *charset; + + CHECK_CHARACTER (ch); + c = XFASTINT (ch); + if (c >= 0400) + error ("Invalid unibyte character: %d", c); + charset = CHARSET_FROM_ID (charset_primary); + c = DECODE_CHAR (charset, c); + if (c < 0) + error ("Can't convert to multibyte character: %d", XINT (ch)); + return make_number (c); +} + +DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte, + Smultibyte_char_to_unibyte, 1, 1, 0, + doc: /* Convert the multibyte character CH to unibyte character.\n\ +The unibyte character is a result of encoding CH by +the current primary charset (value of `charset-primary'). */) + (ch) + Lisp_Object ch; +{ + int c; + unsigned code; + struct charset *charset; + + CHECK_CHARACTER (ch); + c = XFASTINT (ch); + charset = CHARSET_FROM_ID (charset_primary); + code = ENCODE_CHAR (charset, c); + if (code < CHARSET_MIN_CODE (charset) + || code > CHARSET_MAX_CODE (charset)) + error ("Can't convert to unibyte character: %d", XINT (ch)); + return make_number (code); +} + +DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0, + doc: /* Return 1 regardless of the argument CHAR. +This is now an obsolete function. We keep it just for backward compatibility. */) + (ch) + Lisp_Object ch; +{ + CHECK_CHARACTER (ch); + return make_number (1); +} + +DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0, + doc: /* Return width of CHAR when displayed in the current buffer. +The width is measured by how many columns it occupies on the screen. +Tab is taken to occupy `tab-width' columns. */) + (ch) + Lisp_Object ch; +{ + Lisp_Object disp; + int c, width; + struct Lisp_Char_Table *dp = buffer_display_table (); + + CHECK_CHARACTER (ch); + c = XINT (ch); + + /* Get the way the display table would display it. */ + disp = dp ? 
DISP_CHAR_VECTOR (dp, c) : Qnil; + + if (VECTORP (disp)) + width = ASIZE (disp); + else + width = CHAR_WIDTH (c); + + return make_number (width); +} + +/* Return width of string STR of length LEN when displayed in the + current buffer. The width is measured by how many columns it + occupies on the screen. */ + +int +strwidth (str, len) + unsigned char *str; + int len; +{ + return c_string_width (str, len, -1, NULL, NULL); +} + +/* Return width of string STR of length LEN when displayed in the + current buffer. The width is measured by how many columns it + occupies on the screen. If PRECISION > 0, return the width of + longest substring that doesn't exceed PRECISION, and set number of + characters and bytes of the substring in *NCHARS and *NBYTES + respectively. */ + +c_string_width (str, len, precision, nchars, nbytes) + unsigned char *str; + int precision, *nchars, *nbytes; +{ + int i = 0, i_byte = 0; + int width = 0; + struct Lisp_Char_Table *dp = buffer_display_table (); + + while (i_byte < len) + { + int bytes, thiswidth; + Lisp_Object val; + int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes); + + if (dp) + { + val = DISP_CHAR_VECTOR (dp, c); + if (VECTORP (val)) + thiswidth = XVECTOR (val)->size; + else + thiswidth = CHAR_WIDTH (c); + } + else + { + thiswidth = CHAR_WIDTH (c); + } + + if (precision > 0 + && (width + thiswidth > precision)) + { + *nchars = i; + *nbytes = i_byte; + return width; + } + i++; + i_byte += bytes; + width += thiswidth; + } + + if (precision > 0) + { + *nchars = i; + *nbytes = i_byte; + } + + return width; +} + +/* Return width of Lisp string STRING when displayed in the current + buffer. The width is measured by how many columns it occupies on + the screen while paying attention to compositions. If PRECISION > + 0, return the width of longest substring that doesn't exceed + PRECISION, and set number of characters and bytes of the substring + in *NCHARS and *NBYTES respectively. 
*/ + +int +lisp_string_width (string, precision, nchars, nbytes) + Lisp_Object string; + int precision, *nchars, *nbytes; +{ + int len = XSTRING (string)->size; + int len_byte = STRING_BYTES (XSTRING (string)); + unsigned char *str = XSTRING (string)->data; + int i = 0, i_byte = 0; + int width = 0; + struct Lisp_Char_Table *dp = buffer_display_table (); + + while (i < len) + { + int chars, bytes, thiswidth; + Lisp_Object val; + int cmp_id; + int ignore, end; + + if (find_composition (i, -1, &ignore, &end, &val, string) + && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string)) + >= 0)) + { + thiswidth = composition_table[cmp_id]->width; + chars = end - i; + bytes = string_char_to_byte (string, end) - i_byte; + } + else if (dp) + { + int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes); + + chars = 1; + val = DISP_CHAR_VECTOR (dp, c); + if (VECTORP (val)) + thiswidth = XVECTOR (val)->size; + else + thiswidth = CHAR_WIDTH (c); + } + else + { + int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes); + + chars = 1; + thiswidth = CHAR_WIDTH (c); + } + + if (precision > 0 + && (width + thiswidth > precision)) + { + *nchars = i; + *nbytes = i_byte; + return width; + } + i += chars; + i_byte += bytes; + width += thiswidth; + } + + if (precision > 0) + { + *nchars = i; + *nbytes = i_byte; + } + + return width; +} + +DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0, + doc: /* Return width of STRING when displayed in the current buffer. +Width is measured by how many columns it occupies on the screen. +When calculating width of a multibyte character in STRING, +only the base leading-code is considered; the validity of +the following bytes is not checked. Tabs in STRING are always +taken to occupy `tab-width' columns. 
*/) + (str) + Lisp_Object str; +{ + Lisp_Object val; + + CHECK_STRING (str); + XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL)); + return val; +} + +DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0, + doc: /* Return the direction of CHAR. +The returned value is 0 for left-to-right and 1 for right-to-left. */) + (ch) + Lisp_Object ch; +{ + int c; + + CHECK_CHARACTER (ch); + c = XINT (ch); + return CHAR_TABLE_REF (Vchar_direction_table, c); +} + +DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0, + doc: /* Return number of characters between BEG and END. +This is now an obsolete function. We keep it just for backward compatibility. */) + (beg, end) + Lisp_Object beg, end; +{ + int from, to; + + CHECK_NUMBER_COERCE_MARKER (beg); + CHECK_NUMBER_COERCE_MARKER (end); + + from = min (XFASTINT (beg), XFASTINT (end)); + to = max (XFASTINT (beg), XFASTINT (end)); + + return make_number (to - from); +} + +/* Return the number of characters in the NBYTES bytes at PTR. + This works by looking at the contents and checking for multibyte + sequences while assuming that there's no invalid sequence. + However, if the current buffer has enable-multibyte-characters = + nil, we treat each byte as a character. */ + +int +chars_in_text (ptr, nbytes) + unsigned char *ptr; + int nbytes; +{ + /* current_buffer is null at early stages of Emacs initialization. */ + if (current_buffer == 0 + || NILP (current_buffer->enable_multibyte_characters)) + return nbytes; + + return multibyte_chars_in_text (ptr, nbytes); +} + +/* Return the number of characters in the NBYTES bytes at PTR. + This works by looking at the contents and checking for multibyte + sequences while assuming that there's no invalid sequence. It + ignores enable-multibyte-characters. 
*/ + +int +multibyte_chars_in_text (ptr, nbytes) + unsigned char *ptr; + int nbytes; +{ + unsigned char *endp = ptr + nbytes; + int chars = 0; + + while (ptr < endp) + { + int len = MULTIBYTE_LENGTH (ptr, endp); + + if (len == 0) + abort (); + ptr += len; + chars++; + } + + return chars; +} + +/* Parse unibyte text at STR of LEN bytes as a multibyte text, count + characters and bytes in it, and store them in *NCHARS and *NBYTES + respectively. On counting bytes, pay attention to that 8-bit + characters not constructing a valid multibyte sequence are + represented by 2-byte in a multibyte text. */ + +void +parse_str_as_multibyte (str, len, nchars, nbytes) + unsigned char *str; + int len, *nchars, *nbytes; +{ + unsigned char *endp = str + len; + int n, chars = 0, bytes = 0; + + if (len >= MAX_MULTIBYTE_LENGTH) + { + unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; + while (str < adjusted_endp) + { + if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0) + str += n, bytes += n; + else + str++, bytes += 2; + chars++; + } + } + while (str < endp) + { + if ((n = MULTIBYTE_LENGTH (str, endp)) > 0) + str += n, bytes += n; + else + str++, bytes += 2; + chars++; + } + + *nchars = chars; + *nbytes = bytes; + return; +} + +/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text. + It actually converts only such 8-bit characters that don't contruct + a multibyte sequence to multibyte forms of Latin-1 characters. If + NCHARS is nonzero, set *NCHARS to the number of characters in the + text. It is assured that we can use LEN bytes at STR as a work + area and that is enough. Return the number of bytes of the + resulting text. 
*/ + +int +str_as_multibyte (str, len, nbytes, nchars) + unsigned char *str; + int len, nbytes, *nchars; +{ + unsigned char *p = str, *endp = str + nbytes; + unsigned char *to; + int chars = 0; + int n; + + if (nbytes >= MAX_MULTIBYTE_LENGTH) + { + unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; + while (p < adjusted_endp + && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0) + p += n, chars++; + } + while ((n = MULTIBYTE_LENGTH (p, endp)) > 0) + p += n, chars++; + if (nchars) + *nchars = chars; + if (p == endp) + return nbytes; + + to = p; + nbytes = endp - p; + endp = str + len; + safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes); + p = endp - nbytes; + + if (nbytes >= MAX_MULTIBYTE_LENGTH) + { + unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; + while (p < adjusted_endp) + { + if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0) + { + while (n--) + *to++ = *p++; + } + else + { + int c = *p++; + c = BYTE8_TO_CHAR (c); + to += CHAR_STRING (c, to); + } + } + chars++; + } + while (p < endp) + { + if ((n = MULTIBYTE_LENGTH (p, endp)) > 0) + { + while (n--) + *to++ = *p++; + } + else + { + int c = *p++; + c = BYTE8_TO_CHAR (c); + to += CHAR_STRING (c, to); + } + chars++; + } + if (nchars) + *nchars = chars; + return (to - str); +} + +/* Parse unibyte string at STR of LEN bytes, and return the number of + bytes it may ocupy when converted to multibyte string by + `str_to_multibyte'. */ + +int +parse_str_to_multibyte (str, len) + unsigned char *str; + int len; +{ + unsigned char *endp = str + len; + int bytes; + + for (bytes = 0; str < endp; str++) + bytes += (*str < 0x80) ? 1 : 2; + return bytes; +} + + +/* Convert unibyte text at STR of NBYTES bytes to a multibyte text + that contains the same single-byte characters. It actually + converts all 8-bit characters to multibyte forms. It is assured + that we can use LEN bytes at STR as a work area and that is + enough. 
*/ + +int +str_to_multibyte (str, len, bytes) + unsigned char *str; + int len, bytes; +{ + unsigned char *p = str, *endp = str + bytes; + unsigned char *to; + + while (p < endp && *p < 0x80) p++; + if (p == endp) + return bytes; + to = p; + bytes = endp - p; + endp = str + len; + safe_bcopy ((char *) p, (char *) (endp - bytes), bytes); + p = endp - bytes; + while (p < endp) + { + int c = *p++; + + if (c >= 0x80) + c = BYTE8_TO_CHAR (c); + to += CHAR_STRING (c, to); + } + return (to - str); +} + +/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It + actually converts characters in the range 0x80..0xFF to + unibyte. */ + +int +str_as_unibyte (str, bytes) + unsigned char *str; + int bytes; +{ + unsigned char *p = str, *endp = str + bytes; + unsigned char *to = str; + int c, len; + + while (p < endp) + { + c = *p; + len = BYTES_BY_CHAR_HEAD (c); + if (CHAR_BYTE8_HEAD_P (c)) + break; + p += len; + } + to = p; + while (p < endp) + { + c = *p; + len = BYTES_BY_CHAR_HEAD (c); + if (CHAR_BYTE8_HEAD_P (c)) + { + c = STRING_CHAR_ADVANCE (p); + *to++ = CHAR_TO_BYTE8 (c); + } + else + { + while (len--) *to++ = *p++; + } + } + return (to - str); +} + +int +string_count_byte8 (string) + Lisp_Object string; +{ + int multibyte = STRING_MULTIBYTE (string); + int nchars = XSTRING (string)->size; + int nbytes = STRING_BYTES (XSTRING (string)); + unsigned char *p = XSTRING (string)->data; + unsigned char *pend = p + nbytes; + int count = 0; + int c, len; + + if (multibyte) + while (p < pend) + { + c = *p; + len = BYTES_BY_CHAR_HEAD (c); + + if (CHAR_BYTE8_HEAD_P (c)) + count++; + p += len; + } + else + while (p < pend) + { + if (*p++ >= 0x80) + count++; + } + return count; +} + + +Lisp_Object +string_escape_byte8 (string) + Lisp_Object string; +{ + int nchars = XSTRING (string)->size; + int nbytes = STRING_BYTES (XSTRING (string)); + int multibyte = STRING_MULTIBYTE (string); + int byte8_count; + unsigned char *src, *src_end, *dst; + Lisp_Object val; + int c, len; + + 
if (multibyte && nchars == nbytes) + return string; + + byte8_count = string_count_byte8 (string); + + if (byte8_count == 0) + return string; + + if (multibyte) + /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */ + val = make_uninit_multibyte_string (nchars + byte8_count * 2, + nbytes + byte8_count * 2); + else + /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */ + val = make_uninit_string (nbytes + byte8_count * 3); + + src = XSTRING (string)->data; + src_end = src + nbytes; + dst = XSTRING (val)->data; + if (multibyte) + while (src < src_end) + { + c = *src; + len = BYTES_BY_CHAR_HEAD (c); + + if (CHAR_BYTE8_HEAD_P (c)) + { + c = STRING_CHAR_ADVANCE (src); + c = CHAR_TO_BYTE8 (c); + sprintf (dst, "\\%03o", c); + dst += 4; + } + else + while (len--) *dst++ = *src++; + } + else + while (src < src_end) + { + c = *src++; + if (c >= 0x80) + { + sprintf (dst, "\\%03o", c); + dst += 4; + } + else + *dst++ = c; + } + return val; +} + + +DEFUN ("string", Fstring, Sstring, 1, MANY, 0, + doc: /* +Concatenate all the argument characters and make the result a string. 
*/) + (n, args) + int n; + Lisp_Object *args; +{ + int i; + unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n); + unsigned char *p = buf; + int c; + + for (i = 0; i < n; i++) + { + CHECK_CHARACTER (args[i]); + c = XINT (args[i]); + p += CHAR_STRING (c, p); + } + + return make_string_from_bytes ((char *) buf, n, p - buf); +} + +void +init_character_once () +{ +} + +#ifdef emacs + +void +syms_of_character () +{ + DEFSYM (Qcharacterp, "characterp"); + DEFSYM (Qauto_fill_chars, "auto-fill-chars"); + + staticpro (&Vchar_unify_table); + Vchar_unify_table = Qnil; + + defsubr (&Smax_char); + defsubr (&Scharacterp); + defsubr (&Sunibyte_char_to_multibyte); + defsubr (&Smultibyte_char_to_unibyte); + defsubr (&Schar_bytes); + defsubr (&Schar_width); + defsubr (&Sstring_width); + defsubr (&Schar_direction); + defsubr (&Schars_in_region); + defsubr (&Sstring); + + DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector, + doc: /* +Vector of cons cell of a symbol and translation table ever defined. +An ID of a translation table is an index of this vector. */); + Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil); + + DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars, + doc: /* +A char-table for characters which invoke auto-filling. +Such characters have value t in this table. */); + Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil); + CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt); + CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt); + + DEFVAR_LISP ("char-width-table", &Vchar_width_table, + doc: /* +A char-table for width (columns) of each character. */); + Vchar_width_table = Fmake_char_table (Qnil, make_number (1)); + + DEFVAR_LISP ("char-direction-table", &Vchar_direction_table, + doc: /* A char-table for direction of each character. 
*/); + Vchar_direction_table = Fmake_char_table (Qnil, make_number (1)); + + DEFVAR_LISP ("printable-chars", &Vprintable_chars, + doc: /* A char-table for each printable character. */); + Vprintable_chars = Fmake_char_table (Qnil, Qt); +} + +#endif /* emacs */ diff --git a/src/character.h b/src/character.h new file mode 100644 index 00000000000..ca9d0fdeb19 --- /dev/null +++ b/src/character.h @@ -0,0 +1,530 @@ +/* Header for multibyte character handler. + Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN. + Licensed to the Free Software Foundation. + Copyright (C) 2001, 2002 + National Institute of Advanced Industrial Science and Technology (AIST) + Registration Number H13PRO009 + +This file is part of GNU Emacs. + +GNU Emacs is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Emacs is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Emacs; see the file COPYING. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. 
*/ + +#ifndef EMACS_CHARACTER_H +#define EMACS_CHARACTER_H + +/* Character code range, multibyte bit pattern, and byte value range of each form: 0-7F 0xxxxxxx + 00..7F + 80-7FF 110xxxxx 10xxxxxx + C2..DF 80..BF + 800-FFFF 1110xxxx 10xxxxxx 10xxxxxx + E0..EF 80..BF 80..BF + 10000-1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + F0..F7 80..BF 80..BF 80..BF + 200000-3FFF7F 11111000 1000xxxx 10xxxxxx 10xxxxxx 10xxxxxx + F8 80..8F 80..BF 80..BF 80..BF + invalid 11111001 + F9 + invalid 1111101x + FA..FB + invalid 111111xx + FC..FE + + raw-8-bit + 3FFF80-3FFFFF 1100000x 10xxxxxx + C0..C1 80..BF + +*/ + +/* This is the maximum character code ((1 << CHARACTERBITS) - 1). */ +#define MAX_CHAR 0x3FFFFF + +#define MAX_UNICODE_CHAR 0x10FFFF + +/* The largest character code that fits in a multibyte form of the given length; see the table above. */ #define MAX_1_BYTE_CHAR 0x7F +#define MAX_2_BYTE_CHAR 0x7FF +#define MAX_3_BYTE_CHAR 0xFFFF +#define MAX_4_BYTE_CHAR 0x1FFFFF +#define MAX_5_BYTE_CHAR 0x3FFF7F + +/* Raw 8-bit bytes: conversion between a byte and its character code (byte 0x80..0xFF corresponds to code 0x3FFF80..0x3FFFFF), a test for such a character code, and a test for the lead byte (C0 or C1) of its two-byte form. */ #define BYTE8_TO_CHAR(byte) ((byte) + 0x3FFF00) +#define CHAR_TO_BYTE8(c) ((c) - 0x3FFF00) +#define CHAR_BYTE8_P(c) ((c) > MAX_5_BYTE_CHAR) +#define CHAR_BYTE8_HEAD_P(byte) ((byte) == 0xC0 || (byte) == 0xC1) + +/* This is the maximum byte length of multi-byte sequence. */ +#define MAX_MULTIBYTE_LENGTH 5 + +/* Return a Lisp character whose code is C. */ +#define make_char(c) make_number (c) + +/* Nonzero iff C is an ASCII byte. */ +#define ASCII_BYTE_P(c) ((unsigned) (c) < 0x80) + +/* Nonzero iff X is a character. */ +#define CHARACTERP(x) (NATNUMP (x) && XFASTINT (x) <= MAX_CHAR) + +/* Nonzero iff C is valid as a character code. GENERICP is not used + now. It will be removed in the future. NOTE(review): this expands to CHARACTERP, which is documented above as taking a Lisp object X -- confirm what callers pass as C. */ +#define CHAR_VALID_P(c, genericp) CHARACTERP (c) + +/* Check if Lisp object X is a character or not. If it is not, X is assigned the value of wrong_type_argument, so X must be an lvalue. */ +#define CHECK_CHARACTER(x) \ + do { \ + if (! CHARACTERP(x)) x = wrong_type_argument (Qcharacterp, (x)); \ + } while (0) + +/* Nonzero iff C is an ASCII character. */ +#define ASCII_CHAR_P(c) ((unsigned) (c) < 0x80) + +/* Nonzero iff C is a character of code less than 0x100. 
*/ +#define SINGLE_BYTE_CHAR_P(c) ((unsigned) (c) < 0x100) + +/* Nonzero if character C has a printable glyph: C is in the range 32..126, or its entry in Vprintable_chars is non-nil. */ +#define CHAR_PRINTABLE_P(c) \ + (((c) >= 32 && ((c) < 127) \ + || ! NILP (CHAR_TABLE_REF (Vprintable_chars, (c))))) + +/* How many bytes C occupies in a multibyte buffer. The final 2 is for codes above MAX_5_BYTE_CHAR, i.e. raw 8-bit characters. */ +#define CHAR_BYTES(c) \ + ( (c) <= MAX_1_BYTE_CHAR ? 1 \ + : (c) <= MAX_2_BYTE_CHAR ? 2 \ + : (c) <= MAX_3_BYTE_CHAR ? 3 \ + : (c) <= MAX_4_BYTE_CHAR ? 4 \ + : (c) <= MAX_5_BYTE_CHAR ? 5 \ + : 2) + +/* Store multibyte form of the character C at P. The caller should + allocate at least MAX_MULTIBYTE_LENGTH bytes area at P in + advance. Returns the length of the multibyte form. Codes above MAX_3_BYTE_CHAR and up to MAX_5_BYTE_CHAR are handed to char_string_with_unification; larger codes are stored as a two-byte raw 8-bit form. C and P are evaluated more than once. */ + +#define CHAR_STRING(c, p) \ + ((unsigned) (c) <= MAX_1_BYTE_CHAR \ + ? ((p)[0] = (c), \ + 1) \ + : (unsigned) (c) <= MAX_2_BYTE_CHAR \ + ? ((p)[0] = (0xC0 | ((c) >> 6)), \ + (p)[1] = (0x80 | ((c) & 0x3F)), \ + 2) \ + : (unsigned) (c) <= MAX_3_BYTE_CHAR \ + ? ((p)[0] = (0xE0 | ((c) >> 12)), \ + (p)[1] = (0x80 | (((c) >> 6) & 0x3F)), \ + (p)[2] = (0x80 | ((c) & 0x3F)), \ + 3) \ + : (unsigned) (c) <= MAX_5_BYTE_CHAR \ + ? char_string_with_unification (c, p, NULL) \ + : ((p)[0] = (0xC0 | (((c) >> 6) & 0x01)), \ + (p)[1] = (0x80 | ((c) & 0x3F)), \ + 2)) + + +/* Like CHAR_STRING, but advance P to the end of the multibyte + form. P must be an lvalue. NOTE(review): unlike CHAR_STRING, the value of this expression is not consistently the byte length -- the inline branches yield the last byte stored, while the char_string_with_unification branch yields a byte count. */ + +#define CHAR_STRING_ADVANCE(c, p) \ + ((unsigned) (c) <= MAX_1_BYTE_CHAR \ + ? *(p)++ = (c) \ + : (unsigned) (c) <= MAX_2_BYTE_CHAR \ + ? (*(p)++ = (0xC0 | ((c) >> 6)), \ + *(p)++ = (0x80 | ((c) & 0x3F))) \ + : (unsigned) (c) <= MAX_3_BYTE_CHAR \ + ? (*(p)++ = (0xE0 | ((c) >> 12)), \ + *(p)++ = (0x80 | (((c) >> 6) & 0x3F)), \ + *(p)++ = (0x80 | ((c) & 0x3F))) \ + : (unsigned) (c) <= MAX_5_BYTE_CHAR \ + ? char_string_with_unification (c, p, &p) \ + : (*(p)++ = (0xC0 | (((c) >> 6) & 0x01)), \ + *(p)++ = (0x80 | ((c) & 0x3F)))) + + +/* Nonzero iff BYTE starts a character in a multibyte form. 
*/ +#define CHAR_HEAD_P(byte) (((byte) & 0xC0) != 0x80) + +/* Nonzero iff BYTE starts a non-ASCII character in a multibyte + form. */ +#define LEADING_CODE_P(byte) (((byte) & 0xC0) == 0xC0) + +/* Just kept for backward compatibility. This macro will be removed + in the future. */ +#define BASE_LEADING_CODE_P LEADING_CODE_P + +/* How many bytes a character that starts with BYTE occupies in a + multibyte form. Only BYTE is examined; the following bytes are not checked. */ +#define BYTES_BY_CHAR_HEAD(byte) \ + (!((byte) & 0x80) ? 1 \ + : !((byte) & 0x20) ? 2 \ + : !((byte) & 0x10) ? 3 \ + : !((byte) & 0x08) ? 4 \ + : 5) + + +/* Return the length of the multi-byte form at string STR of length + LEN while assuming that STR points to a valid multi-byte form. LEN is not used. As + this macro isn't necessary anymore, all callers will be changed to + use BYTES_BY_CHAR_HEAD directly in the future. */ + +#define MULTIBYTE_FORM_LENGTH(str, len) \ + BYTES_BY_CHAR_HEAD (*(str)) + +/* Parse multibyte string STR of length LENGTH and set BYTES to the + byte length of a character at STR while assuming that STR points to a + valid multibyte form. LENGTH is not used. As this macro isn't necessary anymore, all + callers will be changed to use BYTES_BY_CHAR_HEAD directly in the + future. */ + +#define PARSE_MULTIBYTE_SEQ(str, length, bytes) \ + (bytes) = BYTES_BY_CHAR_HEAD (*(str)) + +/* The byte length of multibyte form at unibyte string P ending at + PEND. If P doesn't point to a valid multibyte form, return 0. NOTE(review): P and PEND appear unparenthesized in the address comparisons, so callers should pass simple expressions. */ + +#define MULTIBYTE_LENGTH(p, pend) \ + (p >= pend ? 0 \ + : !((p)[0] & 0x80) ? 1 \ + : ((p + 1 >= pend) || (((p)[1] & 0xC0) != 0x80)) ? 0 \ + : ((p)[0] & 0xE0) == 0xC0 ? 2 \ + : ((p + 2 >= pend) || (((p)[2] & 0xC0) != 0x80)) ? 0 \ + : ((p)[0] & 0xF0) == 0xE0 ? 3 \ + : ((p + 3 >= pend) || (((p)[3] & 0xC0) != 0x80)) ? 0 \ + : ((p)[0] & 0xF8) == 0xF0 ? 4 \ + : ((p + 4 >= pend) || (((p)[4] & 0xC0) != 0x80)) ? 0 \ + : (p)[0] == 0xF8 && ((p)[1] & 0xF0) == 0x80 ? 5 \ + : 0) + + +/* Like MULTIBYTE_LENGTH but don't check the ending address. 
*/ + +#define MULTIBYTE_LENGTH_NO_CHECK(p) \ + (!((p)[0] & 0x80) ? 1 \ + : ((p)[1] & 0xC0) != 0x80 ? 0 \ + : ((p)[0] & 0xE0) == 0xC0 ? 2 \ + : ((p)[2] & 0xC0) != 0x80 ? 0 \ + : ((p)[0] & 0xF0) == 0xE0 ? 3 \ + : ((p)[3] & 0xC0) != 0x80 ? 0 \ + : ((p)[0] & 0xF8) == 0xF0 ? 4 \ + : ((p)[4] & 0xC0) != 0x80 ? 0 \ + : (p)[0] == 0xF8 && ((p)[1] & 0xF0) == 0x80 ? 5 \ + : 0) + + +/* Return the character code of character whose multibyte form is at + P. The argument LEN is ignored. It will be removed in the + future. A two-byte form whose lead byte is below 0xC2 is a raw 8-bit character and gets 0x3FFF80 added. Forms longer than three bytes are decoded by string_char_with_unification. */ + +#define STRING_CHAR(p, len) \ + (!((p)[0] & 0x80) \ + ? (p)[0] \ + : ! ((p)[0] & 0x20) \ + ? (((((p)[0] & 0x1F) << 6) \ + | ((p)[1] & 0x3F)) \ + + (((unsigned char) (p)[0]) < 0xC2 ? 0x3FFF80 : 0)) \ + : ! ((p)[0] & 0x10) \ + ? ((((p)[0] & 0x0F) << 12) \ + | (((p)[1] & 0x3F) << 6) \ + | ((p)[2] & 0x3F)) \ + : string_char_with_unification (p, NULL, NULL)) + + +/* Like STRING_CHAR but set ACTUAL_LEN to the length of multibyte + form. The argument LEN is ignored. It will be removed in the + future. ACTUAL_LEN must be an int lvalue; its address is passed to string_char_with_unification for the longer forms. */ + +#define STRING_CHAR_AND_LENGTH(p, len, actual_len) \ + (!((p)[0] & 0x80) \ + ? ((actual_len) = 1, (p)[0]) \ + : ! ((p)[0] & 0x20) \ + ? ((actual_len) = 2, \ + (((((p)[0] & 0x1F) << 6) \ + | ((p)[1] & 0x3F)) \ + + (((unsigned char) (p)[0]) < 0xC2 ? 0x3FFF80 : 0))) \ + : ! ((p)[0] & 0x10) \ + ? ((actual_len) = 3, \ + ((((p)[0] & 0x0F) << 12) \ + | (((p)[1] & 0x3F) << 6) \ + | ((p)[2] & 0x3F))) \ + : string_char_with_unification (p, NULL, &actual_len)) + + +/* Like STRING_CHAR but advance P to the end of multibyte form. P must be an lvalue. */ + +#define STRING_CHAR_ADVANCE(p) \ + (!((p)[0] & 0x80) \ + ? *(p)++ \ + : ! ((p)[0] & 0x20) \ + ? ((p) += 2, \ + ((((p)[-2] & 0x1F) << 6) \ + | ((p)[-1] & 0x3F) \ + | (((unsigned char) (p)[-2]) < 0xC2 ? 0x3FFF80 : 0))) \ + : ! ((p)[0] & 0x10) \ + ? 
((p) += 3, \ + ((((p)[-3] & 0x0F) << 12) \ + | (((p)[-2] & 0x3F) << 6) \ + | ((p)[-1] & 0x3F))) \ + : string_char_with_unification (p, &p, NULL)) + + +/* Fetch the "next" character from Lisp string STRING at byte position + BYTEIDX, character position CHARIDX. Store it into OUTPUT. + + All the args must be side-effect-free. + BYTEIDX and CHARIDX must be lvalues; + we increment them past the character fetched. + + The expansion is an `if (1) { ... } else' statement, so an invocation must be followed by a semicolon. */ + +#define FETCH_STRING_CHAR_ADVANCE(OUTPUT, STRING, CHARIDX, BYTEIDX) \ + if (1) \ + { \ + CHARIDX++; \ + if (STRING_MULTIBYTE (STRING)) \ + { \ + unsigned char *ptr = &XSTRING (STRING)->data[BYTEIDX]; \ + int len; \ + \ + OUTPUT = STRING_CHAR_AND_LENGTH (ptr, 0, len); \ + BYTEIDX += len; \ + } \ + else \ + OUTPUT = XSTRING (STRING)->data[BYTEIDX++]; \ + } \ + else + + +/* Like FETCH_STRING_CHAR_ADVANCE but assumes STRING is multibyte. */ + +#define FETCH_STRING_CHAR_ADVANCE_NO_CHECK(OUTPUT, STRING, CHARIDX, BYTEIDX) \ + if (1) \ + { \ + unsigned char *ptr = &XSTRING (STRING)->data[BYTEIDX]; \ + int len; \ + \ + OUTPUT = STRING_CHAR_AND_LENGTH (ptr, 0, len); \ + BYTEIDX += len; \ + CHARIDX++; \ + } \ + else + + +/* Like FETCH_STRING_CHAR_ADVANCE but fetch character from the current + buffer. A single byte is fetched when enable_multibyte_characters of the current buffer is nil. */ + +#define FETCH_CHAR_ADVANCE(OUTPUT, CHARIDX, BYTEIDX) \ + if (1) \ + { \ + CHARIDX++; \ + if (!NILP (current_buffer->enable_multibyte_characters)) \ + { \ + unsigned char *ptr = BYTE_POS_ADDR (BYTEIDX); \ + int len; \ + \ + OUTPUT= STRING_CHAR_AND_LENGTH (ptr, 0, len); \ + BYTEIDX += len; \ + } \ + else \ + { \ + OUTPUT = *(BYTE_POS_ADDR (BYTEIDX)); \ + BYTEIDX++; \ + } \ + } \ + else + + +/* Like FETCH_CHAR_ADVANCE but always decodes a multibyte form, without checking enable_multibyte_characters of the current buffer. 
*/ + +#define FETCH_CHAR_ADVANCE_NO_CHECK(OUTPUT, CHARIDX, BYTEIDX) \ + if (1) \ + { \ + unsigned char *ptr = BYTE_POS_ADDR (BYTEIDX); \ + int len; \ + \ + OUTPUT= STRING_CHAR_AND_LENGTH (ptr, 0, len); \ + BYTEIDX += len; \ + CHARIDX++; \ + } \ + else + + +/* Increase the buffer byte position POS_BYTE of the current buffer to + the next character boundary. No range checking of POS. */ + +#define INC_POS(pos_byte) \ + do { \ + unsigned char *p = BYTE_POS_ADDR (pos_byte); \ + pos_byte += BYTES_BY_CHAR_HEAD (*p); \ + } while (0) + + +/* Decrease the buffer byte position POS_BYTE of the current buffer to + the previous character boundary. No range checking of POS. */ + +#define DEC_POS(pos_byte) \ + do { \ + unsigned char *p; \ + \ + pos_byte--; \ + if (pos_byte < GPT_BYTE) \ + p = BEG_ADDR + pos_byte - 1; \ + else \ + p = BEG_ADDR + GAP_SIZE + pos_byte - 1; \ + while (!CHAR_HEAD_P (*p)) \ + { \ + p--; \ + pos_byte--; \ + } \ + } while (0) + +/* Increment both CHARPOS and BYTEPOS, each in the appropriate way. */ + +#define INC_BOTH(charpos, bytepos) \ + do \ + { \ + (charpos)++; \ + if (NILP (current_buffer->enable_multibyte_characters)) \ + (bytepos)++; \ + else \ + INC_POS ((bytepos)); \ + } \ + while (0) + + +/* Decrement both CHARPOS and BYTEPOS, each in the appropriate way. */ + +#define DEC_BOTH(charpos, bytepos) \ + do \ + { \ + (charpos)--; \ + if (NILP (current_buffer->enable_multibyte_characters)) \ + (bytepos)--; \ + else \ + DEC_POS ((bytepos)); \ + } \ + while (0) + + +/* Increase the buffer byte position POS_BYTE of the current buffer to + the next character boundary. This macro relies on the fact that + *GPT_ADDR and *Z_ADDR are always accessible and the values are + '\0'. No range checking of POS_BYTE. 
*/ + +#define BUF_INC_POS(buf, pos_byte) \ + do { \ + unsigned char *p = BUF_BYTE_ADDRESS (buf, pos_byte); \ + pos_byte += BYTES_BY_CHAR_HEAD (*p); \ + } while (0) + + +/* Decrease the buffer byte position POS_BYTE of the current buffer to + the previous character boundary. No range checking of POS_BYTE. */ + +#define BUF_DEC_POS(buf, pos_byte) \ + do { \ + unsigned char *p; \ + pos_byte--; \ + if (pos_byte < BUF_GPT_BYTE (buf)) \ + p = BUF_BEG_ADDR (buf) + pos_byte - 1; \ + else \ + p = BUF_BEG_ADDR (buf) + BUF_GAP_SIZE (buf) + pos_byte - 1; \ + while (!CHAR_HEAD_P (*p)) \ + { \ + p--; \ + pos_byte--; \ + } \ + } while (0) + + +#define MAYBE_UNIFY_CHAR(c) \ + if (CHAR_TABLE_P (Vchar_unify_table)) \ + { \ + Lisp_Object val; \ + int unified; \ + \ + val = CHAR_TABLE_REF (Vchar_unify_table, c); \ + if (SYMBOLP (val)) \ + { \ + Funify_charset (val, Qnil); \ + val = CHAR_TABLE_REF (Vchar_unify_table, c); \ + } \ + if ((unified = XINT (val)) >= 0) \ + c = unified; \ + } \ + else + +/* Return the width of ASCII character C. The width is measured by + how many columns occupied on the screen when displayed in the + current buffer. */ + +#define ASCII_CHAR_WIDTH(c) \ + (c < 0x20 \ + ? (c == '\t' \ + ? XFASTINT (current_buffer->tab_width) \ + : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \ + : (c < 0x7f \ + ? 1 \ + : ((NILP (current_buffer->ctl_arrow) ? 4 : 2)))) + +/* Return the width of character C. The width is measured by how many + columns occupied on the screen when displayed in the current + buffer. */ + +#define CHAR_WIDTH(c) \ + (ASCII_CHAR_P (c) \ + ? 
ASCII_CHAR_WIDTH (c) \ + : XINT (CHAR_TABLE_REF (Vchar_width_table, c))) + +extern int char_string_with_unification P_ ((int, unsigned char *, + unsigned char **)); +extern int string_char_with_unification P_ ((unsigned char *, + unsigned char **, int *)); + +extern int translate_char P_ ((Lisp_Object, int c)); +extern int char_printable_p P_ ((int c)); +extern void parse_str_as_multibyte P_ ((unsigned char *, int, int *, int *)); +extern int parse_str_to_multibyte P_ ((unsigned char *, int)); +extern int str_as_multibyte P_ ((unsigned char *, int, int, int *)); +extern int str_to_multibyte P_ ((unsigned char *, int, int)); +extern int str_as_unibyte P_ ((unsigned char *, int)); +extern int strwidth P_ ((unsigned char *, int)); +extern int c_string_width P_ ((unsigned char *, int, int, int *, int *)); +extern int lisp_string_width P_ ((Lisp_Object, int, int *, int *)); + +extern Lisp_Object Vprintable_chars; + +extern Lisp_Object Qcharacterp, Qauto_fill_chars; +extern Lisp_Object Vtranslation_table_vector; +extern Lisp_Object Vchar_width_table; +extern Lisp_Object Vchar_direction_table; +extern Lisp_Object Vchar_unify_table; + +/* Return a translation table of id number ID. */ +#define GET_TRANSLATION_TABLE(id) \ + (XCDR(XVECTOR(Vtranslation_table_vector)->contents[(id)])) + +/* A char-table for characters which may invoke auto-filling. */ +extern Lisp_Object Vauto_fill_chars; + +/* Copy LEN bytes from FROM to TO. This macro should be used only + when a caller knows that LEN is short and the obvious copy loop is + faster than calling bcopy which has some overhead. Copying a + multibyte sequence of a character is the typical case. */ + +#define BCOPY_SHORT(from, to, len) \ + do { \ + int i = len; \ + unsigned char *from_p = from, *to_p = to; \ + while (i--) *to_p++ = *from_p++; \ + } while (0) + +#define DEFSYM(sym, name) \ + do { (sym) = intern ((name)); staticpro (&(sym)); } while (0) + +#endif /* EMACS_CHARACTER_H */ -- 2.39.5