From: Eli Zaretskii Date: Fri, 29 Aug 2014 19:18:06 +0000 (+0300) Subject: Implement case-insensitive and Unicode-compliant collation on MS-Windows. X-Git-Tag: emacs-25.0.90~2635^2~679^2~396 X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=21ba51de76390907ca86b1e7715f472dd740fbc3;p=emacs.git Implement case-insensitive and Unicode-compliant collation on MS-Windows. src/fns.c (Fstring_collate_lessp, Fstring_collate_equalp): Doc fix. src/w32proc.c (w32_compare_strings): Accept additional argument IGNORE_CASE. Set up the flags for CompareStringW to ignore case if requested. If w32-collate-ignore-punctuation is non-nil, add NORM_IGNORESYMBOLS to the flags. (LINGUISTIC_IGNORECASE): Define if not already defined. (syms_of_ntproc) <Vw32_collate_ignore_punctuation>: New variable. src/sysdep.c (str_collate) [WINDOWSNT]: Adapt to the interface change. src/w32.h: Adjust prototype of w32_compare_strings. etc/NEWS: Mention w32-collate-ignore-punctuation. Fixes: debbugs:18051 --- diff --git a/etc/ChangeLog b/etc/ChangeLog index 4968d8d7203..edec0dd2c13 100644 --- a/etc/ChangeLog +++ b/etc/ChangeLog @@ -1,3 +1,7 @@ +2014-08-29 Eli Zaretskii + + * NEWS: Mention w32-collate-ignore-punctuation. + 2014-08-29 Dmitry Antipov * NEWS: Mention that `sort' can handle vectors. diff --git a/etc/NEWS b/etc/NEWS index 47ad2b91ebd..c82f2b976ff 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -72,6 +72,13 @@ environment. For the time being this is implemented for modern POSIX systems and for MS-Windows, for other systems they fall back to their counterparts `string-lessp' and `string-equal'. +*** The MS-Windows specific variable `w32-collate-ignore-punctuation', +if set to a non-nil value, causes the above 2 functions to ignore +symbol and punctuation characters when collating strings. This +emulates the behavior of modern Posix platforms when the locale's +codeset is "UTF-8" (as in "en_US.UTF-8"). This is needed because +MS-Windows doesn't support UTF-8 as codeset in its locales.
+ * Editing Changes in Emacs 24.5 diff --git a/src/ChangeLog b/src/ChangeLog index 66588bc3e67..181a43d058f 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,17 @@ +2014-08-29 Eli Zaretskii + + * fns.c (Fstring_collate_lessp, Fstring_collate_equalp): Doc fix. + + * w32proc.c (w32_compare_strings): Accept additional argument + IGNORE_CASE. Set up the flags for CompareStringW to ignore case + if requested. If w32-collate-ignore-punctuation is non-nil, add + NORM_IGNORESYMBOLS to the flags. + (LINGUISTIC_IGNORECASE): Define if not already defined. + (syms_of_ntproc) <Vw32_collate_ignore_punctuation>: New variable. + + * sysdep.c (str_collate) [WINDOWSNT]: Adapt to the interface + change. + 2014-08-29 Michael Albinus * sysdep.c (LC_CTYPE, LC_CTYPE_MASK, towlower_l): diff --git a/src/fns.c b/src/fns.c index 3cca40df50f..f838599230b 100644 --- a/src/fns.c +++ b/src/fns.c @@ -350,7 +350,7 @@ Symbols are also allowed; their print names are used instead. This function obeys the conventions for collation order in your locale settings. For example, punctuation and whitespace characters -are considered less significant for sorting: +might be considered less significant for sorting: \(sort '\("11" "12" "1 1" "1 2" "1.1" "1.2") 'string-collate-lessp) => \("11" "1 1" "1.1" "12" "1 2" "1.2") @@ -358,11 +358,15 @@ are considered less significant for sorting: The optional argument LOCALE, a string, overrides the setting of your current locale identifier for collation. The value is system dependent; a LOCALE \"en_US.UTF-8\" is applicable on POSIX systems, -while it would be \"English_USA.1252\" on MS Windows systems. +while it would be, e.g., \"enu_USA.1252\" on MS-Windows systems. If IGNORE-CASE is non-nil, characters are converted to lower-case before comparing them. +To emulate Unicode-compliant collation on MS-Windows systems, +bind `w32-collate-ignore-punctuation' to a non-nil value, since +the codeset part of the locale cannot be \"UTF-8\" on MS-Windows.
+ If your system does not support a locale environment, this function behaves like `string-lessp'. */) (Lisp_Object s1, Lisp_Object s2, Lisp_Object locale, Lisp_Object ignore_case) @@ -391,8 +395,8 @@ Symbols are also allowed; their print names are used instead. This function obeys the conventions for collation order in your locale settings. For example, characters with different coding points but -the same meaning are considered as equal, like different grave accent -unicode characters: +the same meaning might be considered as equal, like different grave +accent Unicode characters: \(string-collate-equalp \(string ?\\uFF40) \(string ?\\u1FEF)) => t @@ -400,13 +404,20 @@ unicode characters: The optional argument LOCALE, a string, overrides the setting of your current locale identifier for collation. The value is system dependent; a LOCALE \"en_US.UTF-8\" is applicable on POSIX systems, -while it would be \"English_USA.1252\" on MS Windows systems. +while it would be \"enu_USA.1252\" on MS Windows systems. If IGNORE-CASE is non-nil, characters are converted to lower-case before comparing them. +To emulate Unicode-compliant collation on MS-Windows systems, +bind `w32-collate-ignore-punctuation' to a non-nil value, since +the codeset part of the locale cannot be \"UTF-8\" on MS-Windows. + If your system does not support a locale environment, this function -behaves like `string-equal'. */) +behaves like `string-equal'. + +Do NOT use this function to compare file names for equality, only +for sorting them. */) (Lisp_Object s1, Lisp_Object s2, Lisp_Object locale, Lisp_Object ignore_case) { #if defined __STDC_ISO_10646__ || defined WINDOWSNT diff --git a/src/sysdep.c b/src/sysdep.c index 7993a59e721..52a72385f46 100644 --- a/src/sysdep.c +++ b/src/sysdep.c @@ -3796,6 +3796,6 @@ str_collate (Lisp_Object s1, Lisp_Object s2, char *loc = STRINGP (locale) ? 
SSDATA (locale) : NULL; - return w32_compare_strings (SDATA (s1), SDATA (s2), loc); + return w32_compare_strings (SDATA (s1), SDATA (s2), loc, !NILP (ignore_case)); } #endif /* WINDOWSNT */ diff --git a/src/w32.h b/src/w32.h index 68ee14c70e3..2cc179a0c36 100644 --- a/src/w32.h +++ b/src/w32.h @@ -211,7 +211,7 @@ extern int w32_memory_info (unsigned long long *, unsigned long long *, unsigned long long *, unsigned long long *); /* Compare 2 UTF-8 strings in locale-dependent fashion. */ -extern int w32_compare_strings (const char *, const char *, char *); +extern int w32_compare_strings (const char *, const char *, char *, int); #ifdef HAVE_GNUTLS #include diff --git a/src/w32proc.c b/src/w32proc.c index ed62de02433..0b441d45186 100644 --- a/src/w32proc.c +++ b/src/w32proc.c @@ -3213,15 +3213,20 @@ get_lcid (const char *locale_name) #ifndef _NSLCMPERROR # define _NSLCMPERROR INT_MAX #endif +#ifndef LINGUISTIC_IGNORECASE +# define LINGUISTIC_IGNORECASE 0x00000010 +#endif int -w32_compare_strings (const char *s1, const char *s2, char *locname) +w32_compare_strings (const char *s1, const char *s2, char *locname, + int ignore_case) { LCID lcid = GetThreadLocale (); wchar_t *string1_w, *string2_w; int val, needed; extern BOOL g_b_init_compare_string_w; static int (WINAPI *pCompareStringW)(LCID, DWORD, LPCWSTR, int, LPCWSTR, int); + DWORD flags = 0; USE_SAFE_ALLOCA; @@ -3284,11 +3289,22 @@ w32_compare_strings (const char *s1, const char *s2, char *locname) lcid = new_lcid; } - /* FIXME: Need a way to control the FLAGS argument, perhaps via the - CODESET part of LOCNAME. In particular, ls-lisp will want - NORM_IGNORESYMBOLS and sometimes LINGUISTIC_IGNORECASE or - NORM_IGNORECASE. */ - val = pCompareStringW (lcid, 0, string1_w, -1, string2_w, -1); + if (ignore_case) + { + /* NORM_IGNORECASE ignores any tertiary distinction, not just + case variants. LINGUISTIC_IGNORECASE is more selective, and + is sensitive to the locale's language, but it is not + available before Vista. 
*/ + if (w32_major_version >= 6) + flags |= LINGUISTIC_IGNORECASE; + else + flags |= NORM_IGNORECASE; + } + /* This approximates what glibc collation functions do when the + locale's codeset is UTF-8. */ + if (!NILP (Vw32_collate_ignore_punctuation)) + flags |= NORM_IGNORESYMBOLS; + val = pCompareStringW (lcid, flags, string1_w, -1, string2_w, -1); SAFE_FREE (); if (!val) { @@ -3408,6 +3424,20 @@ Any other non-nil value means do this even on remote and removable drives where the performance impact may be noticeable even on modern hardware. */); Vw32_get_true_file_attributes = Qlocal; + DEFVAR_LISP ("w32-collate-ignore-punctuation", + Vw32_collate_ignore_punctuation, + doc: /* Non-nil causes string collation functions ignore punctuation on MS-Windows. +On Posix platforms, `string-collate-lessp' and `string-collate-equalp' +ignore punctuation characters when they compare strings, if the +locale's codeset is UTF-8, as in \"en_US.UTF-8\". Binding this option +to a non-nil value will achieve a similar effect on MS-Windows, where +locales with UTF-8 codeset are not supported. + +Note that setting this to non-nil will also ignore blanks and symbols +in the strings. So do NOT use this option when comparing file names +for equality, only when you need to sort them. */); + Vw32_collate_ignore_punctuation = Qnil; + staticpro (&Vw32_valid_locale_ids); staticpro (&Vw32_valid_codepages); }