[PATCH 8/8] video: Convert UTF-8 input stream to the 437 code page
Heinrich Schuchardt
heinrich.schuchardt at canonical.com
Mon Jan 10 09:18:45 CET 2022
On 1/10/22 01:56, Andre Przywara wrote:
> The bitmap fonts (VGA 8x16 and friends) we import from Linux use the
> 437 code page to map their glyphs. For U-Boot's own purposes this is
> probably fine, but UEFI applications output Unicode, which only matches
> in the very basic first 127 characters.
>
> Add a function that converts UTF-8 character sequences into the
> respective CP437 code point, as far as the characters defined in there
> allow this. This includes quite some international and box drawing
> characters, which are used by UEFI applications.
>
> Signed-off-by: Andre Przywara <andre.przywara at arm.com>
> ---
> drivers/video/Makefile | 1 +
> drivers/video/utf8_cp437.c | 169 ++++++++++++++++++++++++++++++
> drivers/video/vidconsole-uclass.c | 6 +-
> include/video_console.h | 9 ++
> 4 files changed, 184 insertions(+), 1 deletion(-)
> create mode 100644 drivers/video/utf8_cp437.c
>
> diff --git a/drivers/video/Makefile b/drivers/video/Makefile
> index 8956b5f9b00..5f9823dff9e 100644
> --- a/drivers/video/Makefile
> +++ b/drivers/video/Makefile
> @@ -14,6 +14,7 @@ obj-$(CONFIG_DISPLAY) += display-uclass.o
> obj-$(CONFIG_VIDEO_MIPI_DSI) += dsi-host-uclass.o
> obj-$(CONFIG_DM_VIDEO) += video-uclass.o vidconsole-uclass.o
> obj-$(CONFIG_DM_VIDEO) += video_bmp.o
> +obj-$(CONFIG_DM_VIDEO) += utf8_cp437.o
> obj-$(CONFIG_PANEL) += panel-uclass.o
> obj-$(CONFIG_DM_PANEL_HX8238D) += hx8238d.o
> obj-$(CONFIG_SIMPLE_PANEL) += simple_panel.o
> diff --git a/drivers/video/utf8_cp437.c b/drivers/video/utf8_cp437.c
> new file mode 100644
> index 00000000000..cab68b92b6e
> --- /dev/null
> +++ b/drivers/video/utf8_cp437.c
A translation from Unicode to CP437 is needed in the FAT driver (where it is
currently missing), in the Unicode Collation Protocol, and here in the video
code. So this functionality should live in lib/charset.c.
Please, have a look at efi_fat_to_str() in
lib/efi_loader/efi_unicode_collation.c. We should avoid code duplication.
Maybe we should drop CP1250 support to make our effort simpler? No
defconfig uses it.
> @@ -0,0 +1,169 @@
> +/*
> + * Convert UTF-8 bytes into a code page 437 character.
> + * Based on the table in the Code_page_437 Wikipedia page.
> + */
> +
> +#include <linux/types.h>
> +
> +uint8_t code_points_00a0[] = {
> + 255, 173, 155, 156, 7, 157, 7, 21,
> + 7, 7, 166, 174, 170, 7, 7, 7,
> + 248, 241, 253, 7, 7, 230, 20, 250,
> + 7, 7, 167, 175, 172, 171, 7, 168,
> + 7, 7, 7, 7, 142, 143, 146, 128,
> + 7, 144, 7, 7, 7, 7, 7, 7,
> + 7, 165, 7, 7, 7, 7, 153, 7,
> + 7, 7, 7, 7, 154, 7, 7, 225,
> + 133, 160, 131, 7, 132, 134, 145, 135,
> + 138, 130, 136, 137, 141, 161, 140, 139,
> + 7, 164, 149, 162, 147, 7, 148, 246,
> + 7, 151, 163, 150, 129, 7, 7, 152,
> +};
> +
> +uint8_t code_points_2550[] = {
> + 205, 186, 213, 214, 201, 184, 183, 187,
> + 212, 211, 200, 190, 189, 188, 198, 199,
> + 204, 181, 182, 185, 209, 210, 203, 207,
> + 208, 202, 216, 215, 206
> +};
> +
> +static uint8_t utf8_convert_11bit(uint16_t code)
> +{
> + switch (code) {
> + case 0x0192: return 159;
> + case 0x0393: return 226;
> + case 0x0398: return 233;
> + case 0x03A3: return 228;
> + case 0x03A6: return 232;
> + case 0x03A9: return 234;
> + case 0x03B1: return 224;
> + case 0x03B4: return 235;
> + case 0x03B5: return 238;
> + case 0x03C0: return 227;
> + case 0x03C3: return 229;
> + case 0x03C4: return 231;
> + case 0x03C6: return 237;
> + }
> +
> + return 0;
> +};
> +
> +static uint8_t utf8_convert_2xxx(uint16_t code)
This duplicates include/cp437.h.
> +{
> + switch (code) {
> + case 0x2022: return 7;
> + case 0x203C: return 19;
> + case 0x207F: return 252;
> + case 0x20A7: return 158;
> + case 0x2190: return 27;
> + case 0x2191: return 24;
> + case 0x2192: return 26;
> + case 0x2193: return 25;
> + case 0x2194: return 29;
> + case 0x2195: return 18;
> + case 0x21A8: return 23;
> + case 0x2219: return 249;
> + case 0x221A: return 251;
> + case 0x221E: return 236;
> + case 0x221F: return 28;
> + case 0x2229: return 239;
> + case 0x2248: return 247;
> + case 0x2261: return 240;
> + case 0x2264: return 243;
> + case 0x2265: return 242;
> + case 0x2310: return 169;
> + case 0x2320: return 244;
> + case 0x2321: return 245;
> + case 0x2500: return 196;
> + case 0x2502: return 179;
> + case 0x250C: return 218;
> + case 0x2510: return 191;
> + case 0x2514: return 192;
> + case 0x2518: return 217;
> + case 0x251C: return 195;
> + case 0x2524: return 180;
> + case 0x252C: return 194;
> + case 0x2534: return 193;
> + case 0x253C: return 197;
> + case 0x2580: return 223;
> + case 0x2584: return 220;
> + case 0x2588: return 219;
> + case 0x258C: return 221;
> + case 0x2590: return 222;
> + case 0x2591: return 176;
> + case 0x2592: return 177;
> + case 0x2593: return 178;
> + case 0x25A0: return 254;
> + case 0x25AC: return 22;
> + case 0x25B2: return 30;
> + case 0x25BA: return 16;
> + case 0x25BC: return 31;
> + case 0x25C4: return 17;
> + case 0x25CB: return 9;
> + case 0x25D8: return 8;
> + case 0x25D9: return 10;
> + case 0x263A: return 1;
> + case 0x263B: return 2;
> + case 0x263C: return 15;
> + case 0x2640: return 12;
> + case 0x2642: return 11;
> + case 0x2660: return 6;
> + case 0x2663: return 5;
> + case 0x2665: return 3;
> + case 0x2666: return 4;
> + case 0x266A: return 13;
> + case 0x266B: return 14;
> + }
> +
> + return 0;
> +}
> +
> +uint8_t convert_uc16_to_cp437(uint16_t code)
We should not duplicate efi_fat_to_str() but use a common function.
> +{
> + if (code < 0x7f) // ASCII
> + return code;
> + if (code < 0xa0) // high control characters
> + return code;
> + if (code < 0x100) // international characters
> + return code_points_00a0[code - 0xa0];
> + if (code < 0x800)
> + return utf8_convert_11bit(code);
> + if (code >= 0x2550 && code < 0x256d) // block graphics
> + return code_points_2550[code - 0x2550];
How about ÄÖÜß and other European letters?
> +
> + return utf8_convert_2xxx(code);
> +}
> +
> +uint8_t convert_utf8_to_cp437(uint8_t c, uint32_t *esc)
> +{
> + int shift;
> + uint16_t ucs;
> +
> + if (c < 127) // ASCII
> + return c;
> + if (c == 127)
> + return 8; // DEL (?)
> +
> + switch (c & 0xf0) {
> + case 0xc0: case 0xd0: // two bytes sequence
> + *esc = (1U << 24) | ((c & 0x1f) << 6);
> + return 0;
> + case 0xe0: // three bytes sequence
> + *esc = (2U << 24) | ((c & 0x0f) << 12);
> + return 0;
> + case 0xf0: // four bytes sequence
> + *esc = (3U << 24) | ((c & 0x07) << 18);
> + return 0;
> + case 0x80: case 0x90: case 0xa0: case 0xb0: // continuation
> + shift = (*esc >> 24) - 1;
> + ucs = *esc & 0xffffff;
> + if (shift) {
> + *esc = (shift << 24) | ucs | (c & 0x3f) << (shift * 6);
> + return 0;
> + }
> + *esc = 0;
> + return convert_uc16_to_cp437(ucs | (c & 0x3f));
> + }
> +
> + return 0;
> +}
> diff --git a/drivers/video/vidconsole-uclass.c b/drivers/video/vidconsole-uclass.c
> index 420fd86f9ac..ca6e1a2620c 100644
> --- a/drivers/video/vidconsole-uclass.c
> +++ b/drivers/video/vidconsole-uclass.c
> @@ -546,6 +546,7 @@ static int vidconsole_output_glyph(struct udevice *dev, char ch)
> int vidconsole_put_char(struct udevice *dev, char ch)
> {
> struct vidconsole_priv *priv = dev_get_uclass_priv(dev);
> + uint8_t cp437;
> int ret;
>
> /*
> @@ -587,7 +588,10 @@ int vidconsole_put_char(struct udevice *dev, char ch)
> priv->last_ch = 0;
> break;
> default:
> - ret = vidconsole_output_glyph(dev, ch);
> + cp437 = convert_utf8_to_cp437(ch, &priv->ucs);
> + if (cp437 == 0)
> + return 0;
> + ret = vidconsole_output_glyph(dev, cp437);
> if (ret < 0)
> return ret;
> break;
> diff --git a/include/video_console.h b/include/video_console.h
This should go to include/charset.h.
Best regards
Heinrich
> index a908f1412e8..f2d05e7f4e7 100644
> --- a/include/video_console.h
> +++ b/include/video_console.h
> @@ -83,6 +83,7 @@ struct vidconsole_priv {
> int escape_len;
> int row_saved;
> int col_saved;
> + u32 ucs;
> bool cursor_visible;
> char escape_buf[32];
> };
> @@ -304,6 +305,14 @@ static inline int vidconsole_memmove(struct udevice *dev, void *dst,
> return 0;
> }
>
> +/*
> + * Convert an UTF-8 byte into the corresponding character in the CP437
> + * code page. Returns 0 if that character is part of a multi-byte sequence.
> + * for which *esc holds the state of. Repeatedly feed in more bytes until
> + * the return value returns a non-0 character.
Please follow the style described in
https://www.kernel.org/doc/html/latest/doc-guide/kernel-doc.html#function-documentation.
This will allow inclusion in the HTML documentation (see
https://u-boot.readthedocs.io/en/latest/api/index.html).
Best regards
Heinrich
> + */
> +uint8_t convert_utf8_to_cp437(uint8_t c, uint32_t *esc);
> +
> #endif
>
> #endif
More information about the U-Boot
mailing list