Improve font search and handling on MS-Windows

author Eli Zaretskii <eliz@gnu.org>

Sat, 3 Aug 2024 15:11:57 +0000 (18:11 +0300)

committer Eshel Yaron <me@eshelyaron.com>

Tue, 6 Aug 2024 09:54:49 +0000 (11:54 +0200)
author Eli Zaretskii <eliz@gnu.org>
Sat, 3 Aug 2024 15:11:57 +0000 (18:11 +0300)
committer Eshel Yaron <me@eshelyaron.com>
Tue, 6 Aug 2024 09:54:49 +0000 (11:54 +0200)
diff --git a/lisp/international/fontset.el b/lisp/international/fontset.el

index 695c313cb264dfb2187c99acb075884857019989..c9b60418b227e37792320d410124f59d2715362d 100644 (file)
--- a/lisp/international/fontset.el
+++ b/lisp/international/fontset.el
@@ -88,6 +88,7 @@
         ("iso10646-1$" . (unicode-bmp . nil))
         ("iso10646.indian-1" . (unicode-bmp . nil))
         ("unicode-bmp" . (unicode-bmp . nil))
+        ("unicode-sip" . (unicode-sip . nil)) ; used by w32font.c
         ("abobe-symbol" . symbol)
         ("sisheng_cwnn" . chinese-sisheng)
         ("mulearabic-0" . arabic-digit)
diff --git a/src/font.c b/src/font.c

index 246fe1c4426e3d25a1a288b1cc9564308d9a5cde..112618a73078c0354c4b67853cc18e79b7781a5d 100644 (file)
--- a/src/font.c
+++ b/src/font.c
@@ -1627,15 +1627,30 @@ font_parse_fcname (char *name, ptrdiff_t len, Lisp_Object font)
         {
           bool decimal = 0, size_found = 1;
           for (q = p + 1; *q && *q != ':'; q++)
-           if (! c_isdigit (*q))
-             {
-               if (*q != '.' || decimal)
-                 {
-                   size_found = 0;
-                   break;
-                 }
-               decimal = 1;
-             }
+           {
+#ifdef HAVE_NTGUI
+             /* MS-Windows has several CJK fonts whose name ends in
+                 "-ExtB".  It also has fonts whose names end in "-R" or
+                 "-B", and one font whose name ends in "-SB".  */
+             if (q == p + 1 && (strncmp (q, "ExtB", 4) == 0
+                                || strncmp (q, "R", 1) == 0
+                                || strncmp (q, "B", 1) == 0
+                                || strncmp (q, "SB", 2) == 0))
+               {
+                 size_found = 0;
+                 break;
+               }
+#endif
+             if (! c_isdigit (*q))
+               {
+                 if (*q != '.' || decimal)
+                   {
+                     size_found = 0;
+                     break;
+                   }
+                 decimal = 1;
+               }
+           }
           if (size_found)
             {
               family_end = p;
@@ -2000,6 +2015,15 @@ font_parse_family_registry (Lisp_Object family, Lisp_Object registry, Lisp_Objec
        len = SBYTES (family);
        p0 = SSDATA (family);
        p1 = strchr (p0, '-');
+#ifdef HAVE_NTGUI
+      /* MS-Windows has fonts whose family name ends in "-ExtB" and
+         other suffixes which include a hyphen.  */
+      if (p1 && (strcmp (p1, "-ExtB") == 0
+                || strcmp (p1, "-R") == 0
+                || strcmp (p1, "-B") == 0
+                || strcmp (p1, "-SB") == 0))
+       p1 = NULL;
+#endif
        if (p1)
         {
           if ((*p0 != '*' && p1 - p0 > 0)
diff --git a/src/w32font.c b/src/w32font.c

index ccbd3837afbe0ad5d073a68933775affe14a7b5a..efb42d803365cc4c6e64a81c0470eb3c53283ebb 100644 (file)
--- a/src/w32font.c
+++ b/src/w32font.c
@@ -809,6 +809,93 @@ w32font_otf_drive (struct font *font, Lisp_Object features,
                     bool alternate_subst);
    */
  
+/* Notes about the way fonts are found on MS-Windows when we have a
+   character unsupported by the default font.
+
+   Since we don't use Fontconfig on MS-Windows, we cannot efficiently
+   search for fonts which support certain characters, because Windows
+   doesn't store this information anywhere, and we can only know whether
+   a font supports some character if we actually open the font, which is
+   expensive and slow.  Instead, we rely on font information Windows
+   exposes to the API we use to enumerate available fonts,
+   EnumFontFamiliesEx.  This information includes two bitmapped attributes:
+
+     USB (which stands for Unicode Subset Bitfields) -- this is an array
+         of 4 32-bit values, 128 bits in total, where each bit
+         corresponds to some block (sometimes several related blocks) of
+         Unicode codepoints which the font claims to support.
+     CSB (which stands for Codepage Bitfields) -- this is an array of 2
+        32-bit values (64 bits), where each bit corresponds to some
+        codepage whose characters the font claims to support.
+
+   When Emacs needs to find a font for a character, it enumerates the
+   available fonts, filtering the fonts by examining these bitmaps and a
+   few other font attributes.  The script of the character is converted
+   to the corresponding bits in USB, and a font that has any of these
+   bits set is deemed as a candidate; see font_supported_scripts, which
+   is called by font_matches_spec.  The problem with this strategy is
+   twofold:
+
+    - Some Unicode blocks have no USB bits.  For the scripts
+      corresponding to those blocks we use a small cache of fonts known
+      to support those scripts.  This cache is calculated once, and needs
+      not be recalculated as long as no fonts are installed or deleted
+      (it can be saved in your init file and reused for the following
+      sessions).  See the function w32-find-non-USB-fonts.  Note that
+      for that function to work well, 'script-representative-chars'
+      should include the important characters for each script which has
+      no USB bits defined.
+
+    - Some fonts claim support for a block, but don't support it well.
+      Other fonts support some blocks very well, but don't set the
+      corresponding USB bits for the blocks.  For these we use some
+      heuristics:
+
+      . For a few fonts that claim coverage, but don't provide it, we
+       either recognize them by name and reject their false claims, or
+       let users set face-ignored-fonts to ignore those fonts.
+
+      . For fonts that support some blocks very well, but don't set
+       their USB bits, we examine the CSB bits instead.  This is
+       particularly important for some CJK fonts with good support in
+       the SIP area: they only set the SIP bit (bit 57) in the USB.  We
+       consider those as candidates for CJK scripts ('han', 'kana',
+       etc.) if the CSB bits are set for the corresponding CJK
+       codepages.
+
+   Eventually, some characters could still appear as "tofu" (a box with
+   the character's hex codepoint), even though a font might be available
+   on the system which supports the character.  This is because the
+   above strategy, with all its heuristics and tricks, sometimes fails.
+   For example, it could fail if the system has several fonts installed
+   whose coverage of some blocks is incomplete -- Emacs could select
+   such a font based on its USB bits, and realize the font has no glyph
+   for a character only when it's too late.  This happens because when
+   several fonts claim coverage of the same Unicode block, Emacs on
+   Windows has no way of preferring one over the other, if they all
+   support the same values of size, weight, and slant.  So Emacs usually
+   selects the first such candidate, which could lack glyphs for the
+   characters Emacs needs to display.  Since we avoid naming non-free
+   Windows fonts in Emacs's sources, this cannot be fixed in the
+   default fontset setup provided by Emacs: we cannot arrange for the
+   "good" fonts to be used in all such cases, because that would mean
+   naming those fonts.  The solution for these issues is to customize the
+   default fontset using set-fontset-font, to force Emacs to use a font
+   known to support some characters.
+
+   One other Windows-specific issue is the fact that some Windows fonts
+   have hyphens in their names.  Emacs generally follows the XLFD
+   specifications, where a hyphen is used as separator between segments
+   of a font spec.  There are a few places in the code in font.c where
+   Emacs handles such font names specially, and it currently knows about
+   font names documented for Windows versions up to and including 11.
+   See this page for the latest update:
+
+     https://learn.microsoft.com/en-us/typography/fonts/windows_11_font_list
+
+   If more fonts are added to Windows that have hyphens in their names,
+   the code in font.c will need to be updated.  */
+
  /* Internal implementation of w32font_list.
     Additional parameter opentype_only restricts the returned fonts to
     opentype fonts, which can be used with the Uniscribe backend.  */
@@ -1455,22 +1542,34 @@ static int
  w32font_coverage_ok (FONTSIGNATURE * coverage, BYTE charset)
  {
    DWORD subrange1 = coverage->fsUsb[1];
+  DWORD codepages0 = coverage->fsCsb[0];
  
  #define SUBRANGE1_HAN_MASK 0x08000000
  #define SUBRANGE1_HANGEUL_MASK 0x01000000
  #define SUBRANGE1_JAPANESE_MASK (0x00060000 | SUBRANGE1_HAN_MASK)
+#define SUBRANGE1_SIP_MASK 0x02000000
  
+/* We consider the coverage to be OK if either (a) subrange1 has the
+   bits set that correspond to CHARSET, or (b) subrange1 indicates SIP
+   support and codepages0 has one or more bits set corresponding to
+   CHARSET.  */
    if (charset == GB2312_CHARSET || charset == CHINESEBIG5_CHARSET)
      {
-      return (subrange1 & SUBRANGE1_HAN_MASK) == SUBRANGE1_HAN_MASK;
+      return ((subrange1 & SUBRANGE1_HAN_MASK) == SUBRANGE1_HAN_MASK
+             || ((subrange1 & SUBRANGE1_SIP_MASK) != 0
+                 && (codepages0 & CSB_CHINESE) != 0));
      }
    else if (charset == SHIFTJIS_CHARSET)
      {
-      return (subrange1 & SUBRANGE1_JAPANESE_MASK) == SUBRANGE1_JAPANESE_MASK;
+      return ((subrange1 & SUBRANGE1_JAPANESE_MASK) == SUBRANGE1_JAPANESE_MASK
+             || ((subrange1 & SUBRANGE1_SIP_MASK) != 0
+                 && (codepages0 & CSB_JAPANESE) != 0));
      }
    else if (charset == HANGEUL_CHARSET)
      {
-      return (subrange1 & SUBRANGE1_HANGEUL_MASK) == SUBRANGE1_HANGEUL_MASK;
+      return ((subrange1 & SUBRANGE1_HANGEUL_MASK) == SUBRANGE1_HANGEUL_MASK
+             || ((subrange1 & SUBRANGE1_SIP_MASK) != 0
+                 && (codepages0 & CSB_KOREAN) != 0));
      }
  
    return 1;
@@ -1620,11 +1719,18 @@ add_font_entity_to_list (ENUMLOGFONTEX *logical_font,
         }
        /* unicode-sip fonts must contain characters in Unicode plane 2.
          so look for bit 57 (surrogates) in the Unicode subranges, plus
-        the bits for CJK ranges that include those characters.  */
+        the bits for CJK ranges that include those characters or CJK
+        bits in code-page bit fields.  */
        else if (EQ (spec_charset, Qunicode_sip))
         {
-         if (!(physical_font->ntmFontSig.fsUsb[1] & 0x02000000)
-             || !(physical_font->ntmFontSig.fsUsb[1] & 0x28000000))
+         if (!((physical_font->ntmFontSig.fsUsb[1] & 0x02000000)
+               && ((physical_font->ntmFontSig.fsUsb[1] & 0x28000000)
+                   /* Some CJK fonts with very good coverage of SIP
+                       characters have only the 0x02000000 bit in USB
+                       set, so we allow them if their code-page bits
+                       indicate support for CJK character sets.  */
+                   || (physical_font->ntmFontSig.fsCsb[0]
+                       & (CSB_CHINESE | CSB_JAPANESE | CSB_KOREAN)))))
             return 1;
         }
  
@@ -2328,7 +2434,18 @@ font_supported_scripts (FONTSIGNATURE * sig)
    SUBRANGE (53, Qphags_pa);
    /* 54: Enclosed CJK letters and months, 55: CJK Compatibility.  */
    SUBRANGE (56, Qhangul);
-  /* 57: Surrogates.  */
+  /* 57: Non-BMP.  Processed specially: Several fonts that support CJK
+     Ideographs Extensions and other extensions, set just this bit and
+     Latin, and nothing else.  */
+  if (subranges[57 / 32] & (1U << (57 % 32)))
+    {
+      if ((sig->fsCsb[0] & CSB_CHINESE))
+       supported = Fcons (Qhan, supported);
+      if ((sig->fsCsb[0] & CSB_JAPANESE))
+       supported = Fcons (Qkana, supported);
+      if ((sig->fsCsb[0] & CSB_KOREAN))
+       supported = Fcons (Qhangul, supported);
+    }
    SUBRANGE (58, Qphoenician);
    SUBRANGE (59, Qhan); /* There are others, but this is the main one.  */
    SUBRANGE (59, Qideographic_description); /* Windows lumps this in.  */
@@ -2385,7 +2502,7 @@ font_supported_scripts (FONTSIGNATURE * sig)
    SUBRANGE (97, Qglagolitic);
    SUBRANGE (98, Qtifinagh);
    /* 99: Yijing Hexagrams.  */
-  SUBRANGE (99, Qhan);
+  SUBRANGE (99, Qcjk_misc);
    SUBRANGE (100, Qsyloti_nagri);
    SUBRANGE (101, Qlinear_b);
    SUBRANGE (101, Qaegean_number);
diff --git a/src/w32uniscribe.c b/src/w32uniscribe.c

index 471bdf544d8dd7d24bb888d7deda3330c85ccf23..751963705d26621e6e351bbc538580fc35b5632f 100644 (file)
--- a/src/w32uniscribe.c
+++ b/src/w32uniscribe.c
@@ -895,7 +895,7 @@ uniscribe_check_otf_1 (HDC context, Lisp_Object script, Lisp_Object lang,
                        Lisp_Object features[2], int *retval)
  {
    SCRIPT_CACHE cache = NULL;
-  OPENTYPE_TAG tags[32], script_tag, lang_tag;
+  OPENTYPE_TAG tags[128], script_tag, lang_tag;
    int max_tags = ARRAYELTS (tags);
    int ntags, i, ret = 0;
    HRESULT rslt;
author	Eli Zaretskii <eliz@gnu.org>
	Sat, 3 Aug 2024 15:11:57 +0000 (18:11 +0300)
committer	Eshel Yaron <me@eshelyaron.com>
	Tue, 6 Aug 2024 09:54:49 +0000 (11:54 +0200)
lisp/international/fontset.el		patch \| blob \| history
src/font.c		patch \| blob \| history
src/w32font.c		patch \| blob \| history
src/w32uniscribe.c		patch \| blob \| history