From 20a4dfca1c812492465f4cd6374c74a707305009 Mon Sep 17 00:00:00 2001 From: Eli Zaretskii Date: Sat, 3 Aug 2024 18:11:57 +0300 Subject: [PATCH] Improve font search and handling on MS-Windows * src/w32font.c: Add commentary about font search on MS-Windows. (w32font_coverage_ok, add_font_entity_to_list) (font_supported_scripts): Consider the coverage OK if a font has only the SIP bit set, but also sets relevant codepage bits in the CSB bits. (font_supported_scripts): Fix script for USB bit 99. * src/font.c (font_parse_fcname, font_parse_family_registry) [HAVE_NTGUI]: Don't consider hyphenated suffixes of some Windows fonts as not belonging to the family name. * src/w32uniscribe.c (uniscribe_check_otf_1): Increase tags[] array size, to avoid the E_OUTOFMEMORY error for some fonts. * lisp/international/fontset.el (font-encoding-alist): Add 'unicode-sip'. (cherry picked from commit ff6954b9c833bfeb8032fb772fa08e60e9ec56a8) --- lisp/international/fontset.el | 1 + src/font.c | 42 ++++++++--- src/w32font.c | 133 ++++++++++++++++++++++++++++++++-- src/w32uniscribe.c | 2 +- 4 files changed, 160 insertions(+), 18 deletions(-) diff --git a/lisp/international/fontset.el b/lisp/international/fontset.el index 695c313cb26..c9b60418b22 100644 --- a/lisp/international/fontset.el +++ b/lisp/international/fontset.el @@ -88,6 +88,7 @@ ("iso10646-1$" . (unicode-bmp . nil)) ("iso10646.indian-1" . (unicode-bmp . nil)) ("unicode-bmp" . (unicode-bmp . nil)) + ("unicode-sip" . (unicode-sip . nil)) ; used by w32font.c ("abobe-symbol" . symbol) ("sisheng_cwnn" . chinese-sisheng) ("mulearabic-0" . arabic-digit) diff --git a/src/font.c b/src/font.c index 246fe1c4426..112618a7307 100644 --- a/src/font.c +++ b/src/font.c @@ -1627,15 +1627,30 @@ font_parse_fcname (char *name, ptrdiff_t len, Lisp_Object font) { bool decimal = 0, size_found = 1; for (q = p + 1; *q && *q != ':'; q++) - if (! c_isdigit (*q)) - { - if (*q != '.' 
|| decimal) - { - size_found = 0; - break; - } - decimal = 1; - } + { +#ifdef HAVE_NTGUI + /* MS-Windows has several CJK fonts whose name ends in + "-ExtB". It also has fonts whose names end in "-R" or + "-B", and one font whose name ends in "-SB". */ + if (q == p + 1 && (strncmp (q, "ExtB", 4) == 0 + || strncmp (q, "R", 1) == 0 + || strncmp (q, "B", 1) == 0 + || strncmp (q, "SB", 2) == 0)) + { + size_found = 0; + break; + } +#endif + if (! c_isdigit (*q)) + { + if (*q != '.' || decimal) + { + size_found = 0; + break; + } + decimal = 1; + } + } if (size_found) { family_end = p; @@ -2000,6 +2015,15 @@ font_parse_family_registry (Lisp_Object family, Lisp_Object registry, Lisp_Objec len = SBYTES (family); p0 = SSDATA (family); p1 = strchr (p0, '-'); +#ifdef HAVE_NTGUI + /* MS-Windows has fonts whose family name ends in "-ExtB" and + other suffixes which include a hyphen. */ + if (p1 && (strcmp (p1, "-ExtB") == 0 + || strcmp (p1, "-R") == 0 + || strcmp (p1, "-B") == 0 + || strcmp (p1, "-SB") == 0)) + p1 = NULL; +#endif if (p1) { if ((*p0 != '*' && p1 - p0 > 0) diff --git a/src/w32font.c b/src/w32font.c index ccbd3837afb..efb42d80336 100644 --- a/src/w32font.c +++ b/src/w32font.c @@ -809,6 +809,93 @@ w32font_otf_drive (struct font *font, Lisp_Object features, bool alternate_subst); */ +/* Notes about the way fonts are found on MS-Windows when we have a + character unsupported by the default font. + + Since we don't use Fontconfig on MS-Windows, we cannot efficiently + search for fonts which support certain characters, because Windows + doesn't store this information anywhere, and we can only know whether + a font supports some character if we actually open the font, which is + expensive and slow. Instead, we rely on font information Windows + exposes to the API we use to enumerate available fonts, + EnumFontFamiliesEx. 
This information includes two bitmapped attributes: + + USB (which stands for Unicode Subset Bitfields) -- this is an array + of 4 32-bit values, 128 bits in total, where each bit + corresponds to some block (sometimes several related blocks) of + Unicode codepoints which the font claims to support. + CSB (which stands for Codepage Bitfields) -- this is an array of 2 + 32-bit values (64 bits), where each bit corresponds to some + codepage whose characters the font claims to support. + + When Emacs needs to find a font for a character, it enumerates the + available fonts, filtering the fonts by examining these bitmaps and a + few other font attributes. The script of the character is converted + to the corresponding bits in USB, and a font that has any of these + bits set is deemed a candidate; see font_supported_scripts, which + is called by font_matches_spec. The problem with this strategy is + twofold: + + - Some Unicode blocks have no USB bits. For the scripts + corresponding to those blocks we use a small cache of fonts known + to support those scripts. This cache is calculated once, and need + not be recalculated as long as no fonts are installed or deleted + (it can be saved in your init file and reused for the following + sessions). See the function w32-find-non-USB-fonts. Note that + for that function to work well, 'script-representative-chars' + should include the important characters for each script which has + no USB bits defined. + + - Some fonts claim support for a block, but don't support it well. + Other fonts support some blocks very well, but don't set the + corresponding USB bits for the blocks. For these we use some + heuristics: + + . For the few fonts that claim coverage, but don't provide it, we + either recognize them by name and reject their false claims, or + let users set face-ignored-fonts to ignore those fonts. + + . For fonts that support some blocks very well, but don't set + their USB bits, we examine the CSB bits instead. 
This is + particularly important for some CJK fonts with good support in + the SIP area: they only set the SIP bit (bit 57) in the USB. We + consider those as candidates for CJK scripts ('han', 'kana', + etc.) if the CSB bits are set for the corresponding CJK + codepages. + + Eventually, some characters could still appear as "tofu" (a box with + the character's hex codepoint), even though a font might be available + on the system which supports the character. This is because the + above strategy, with all its heuristics and tricks, sometimes fails. + For example, it could fail if the system has several fonts installed + whose coverage of some blocks is incomplete -- Emacs could select + such a font based on its USB bits, and realize the font has no glyph + for a character only when it's too late. This happens because when + several fonts claim coverage of the same Unicode block, Emacs on + Windows has no way of preferring one over the other, if they all + support the same values of size, weight, and slant. So Emacs usually + selects the first such candidate, which could lack glyphs for the + characters Emacs needs to display. Since we avoid naming non-free + Windows fonts in Emacs's sources, this cannot be fixed in the + default fontset setup provided by Emacs: we cannot arrange for the + "good" fonts to be used in all such cases, because that would mean + naming those fonts. The solution for these issues is to customize the + default fontset using set-fontset-font, to force Emacs to use a font + known to support some characters. + + One other Windows-specific issue is the fact that some Windows fonts + have hyphens in their names. Emacs generally follows the XLFD + specifications, where a hyphen is used as a separator between segments + of a font spec. There are a few places in the code in font.c where + Emacs handles such font names specially, and it currently knows about + font names documented for Windows versions up to and including 11. 
+ See this page for the latest update: + + https://learn.microsoft.com/en-us/typography/fonts/windows_11_font_list + + If more fonts are added to Windows that have hyphens in their names, + the code in font.c will need to be updated. */ + /* Internal implementation of w32font_list. Additional parameter opentype_only restricts the returned fonts to opentype fonts, which can be used with the Uniscribe backend. */ @@ -1455,22 +1542,34 @@ static int w32font_coverage_ok (FONTSIGNATURE * coverage, BYTE charset) { DWORD subrange1 = coverage->fsUsb[1]; + DWORD codepages0 = coverage->fsCsb[0]; #define SUBRANGE1_HAN_MASK 0x08000000 #define SUBRANGE1_HANGEUL_MASK 0x01000000 #define SUBRANGE1_JAPANESE_MASK (0x00060000 | SUBRANGE1_HAN_MASK) +#define SUBRANGE1_SIP_MASK 0x02000000 +/* We consider the coverage to be OK if either (a) subrange1 has the + bits set that correspond to CHARSET, or (b) subrange1 indicates SIP + support and codepages0 has one or more bits set corresponding to + CHARSET. */ if (charset == GB2312_CHARSET || charset == CHINESEBIG5_CHARSET) { - return (subrange1 & SUBRANGE1_HAN_MASK) == SUBRANGE1_HAN_MASK; + return ((subrange1 & SUBRANGE1_HAN_MASK) == SUBRANGE1_HAN_MASK + || ((subrange1 & SUBRANGE1_SIP_MASK) != 0 + && (codepages0 & CSB_CHINESE) != 0)); } else if (charset == SHIFTJIS_CHARSET) { - return (subrange1 & SUBRANGE1_JAPANESE_MASK) == SUBRANGE1_JAPANESE_MASK; + return ((subrange1 & SUBRANGE1_JAPANESE_MASK) == SUBRANGE1_JAPANESE_MASK + || ((subrange1 & SUBRANGE1_SIP_MASK) != 0 + && (codepages0 & CSB_JAPANESE) != 0)); } else if (charset == HANGEUL_CHARSET) { - return (subrange1 & SUBRANGE1_HANGEUL_MASK) == SUBRANGE1_HANGEUL_MASK; + return ((subrange1 & SUBRANGE1_HANGEUL_MASK) == SUBRANGE1_HANGEUL_MASK + || ((subrange1 & SUBRANGE1_SIP_MASK) != 0 + && (codepages0 & CSB_KOREAN) != 0)); } return 1; @@ -1620,11 +1719,18 @@ add_font_entity_to_list (ENUMLOGFONTEX *logical_font, } /* unicode-sip fonts must contain characters in Unicode plane 2. 
so look for bit 57 (surrogates) in the Unicode subranges, plus - the bits for CJK ranges that include those characters. */ + the bits for CJK ranges that include those characters, or CJK + bits in the code-page bit fields. */ else if (EQ (spec_charset, Qunicode_sip)) { - if (!(physical_font->ntmFontSig.fsUsb[1] & 0x02000000) - || !(physical_font->ntmFontSig.fsUsb[1] & 0x28000000)) + if (!((physical_font->ntmFontSig.fsUsb[1] & 0x02000000) + && ((physical_font->ntmFontSig.fsUsb[1] & 0x28000000) + /* Some CJK fonts with very good coverage of SIP + characters have only the 0x02000000 bit in USB + set, so we allow them if their code-page bits + indicate support for CJK character sets. */ + || (physical_font->ntmFontSig.fsCsb[0] + & (CSB_CHINESE | CSB_JAPANESE | CSB_KOREAN))))) return 1; } @@ -2328,7 +2434,18 @@ font_supported_scripts (FONTSIGNATURE * sig) SUBRANGE (53, Qphags_pa); /* 54: Enclosed CJK letters and months, 55: CJK Compatibility. */ SUBRANGE (56, Qhangul); - /* 57: Surrogates. */ + /* 57: Non-BMP. Processed specially: Several fonts that support CJK + Ideographs Extensions and other extensions set just this bit and + Latin, and nothing else. */ + if (subranges[57 / 32] & (1U << (57 % 32))) + { + if ((sig->fsCsb[0] & CSB_CHINESE)) + supported = Fcons (Qhan, supported); + if ((sig->fsCsb[0] & CSB_JAPANESE)) + supported = Fcons (Qkana, supported); + if ((sig->fsCsb[0] & CSB_KOREAN)) + supported = Fcons (Qhangul, supported); + } SUBRANGE (58, Qphoenician); SUBRANGE (59, Qhan); /* There are others, but this is the main one. */ SUBRANGE (59, Qideographic_description); /* Windows lumps this in. */ @@ -2385,7 +2502,7 @@ font_supported_scripts (FONTSIGNATURE * sig) SUBRANGE (97, Qglagolitic); SUBRANGE (98, Qtifinagh); /* 99: Yijing Hexagrams. 
*/ - SUBRANGE (99, Qhan); + SUBRANGE (99, Qcjk_misc); SUBRANGE (100, Qsyloti_nagri); SUBRANGE (101, Qlinear_b); SUBRANGE (101, Qaegean_number); diff --git a/src/w32uniscribe.c b/src/w32uniscribe.c index 471bdf544d8..751963705d2 100644 --- a/src/w32uniscribe.c +++ b/src/w32uniscribe.c @@ -895,7 +895,7 @@ uniscribe_check_otf_1 (HDC context, Lisp_Object script, Lisp_Object lang, Lisp_Object features[2], int *retval) { SCRIPT_CACHE cache = NULL; - OPENTYPE_TAG tags[32], script_tag, lang_tag; + OPENTYPE_TAG tags[128], script_tag, lang_tag; int max_tags = ARRAYELTS (tags); int ntags, i, ret = 0; HRESULT rslt; -- 2.39.2