[PATCH 8/8] video: Convert UTF-8 input stream to the 437 code page

Mon Jan 10 09:18:45 CET 2022

On 1/10/22 01:56, Andre Przywara wrote:
> The bitmap fonts (VGA 8x16 and friends) we import from Linux use the
> 437 code page to map their glyphs. For U-Boot's own purposes this is
> probably fine, but UEFI applications output Unicode, which only matches
> in the very basic first 127 characters.
> 
> Add a function that converts UTF-8 character sequences into the
> respective CP437 code point, as far as the characters defined in there
> allow this. This includes quite some international and box drawing
> characters, which are used by UEFI applications.
> 
> Signed-off-by: Andre Przywara <andre.przywara at arm.com>
> ---
>   drivers/video/Makefile            |   1 +
>   drivers/video/utf8_cp437.c        | 169 ++++++++++++++++++++++++++++++
>   drivers/video/vidconsole-uclass.c |   6 +-
>   include/video_console.h           |   9 ++
>   4 files changed, 184 insertions(+), 1 deletion(-)
>   create mode 100644 drivers/video/utf8_cp437.c
> 
> diff --git a/drivers/video/Makefile b/drivers/video/Makefile
> index 8956b5f9b00..5f9823dff9e 100644
> --- a/drivers/video/Makefile
> +++ b/drivers/video/Makefile
> @@ -14,6 +14,7 @@ obj-$(CONFIG_DISPLAY) += display-uclass.o
>   obj-$(CONFIG_VIDEO_MIPI_DSI) += dsi-host-uclass.o
>   obj-$(CONFIG_DM_VIDEO) += video-uclass.o vidconsole-uclass.o
>   obj-$(CONFIG_DM_VIDEO) += video_bmp.o
> +obj-$(CONFIG_DM_VIDEO) += utf8_cp437.o
>   obj-$(CONFIG_PANEL) += panel-uclass.o
>   obj-$(CONFIG_DM_PANEL_HX8238D) += hx8238d.o
>   obj-$(CONFIG_SIMPLE_PANEL) += simple_panel.o
> diff --git a/drivers/video/utf8_cp437.c b/drivers/video/utf8_cp437.c
> new file mode 100644
> index 00000000000..cab68b92b6e
> --- /dev/null
> +++ b/drivers/video/utf8_cp437.c

A translation from Unicode to CP437 is needed in the FAT driver (where it
is missing), in the Unicode Collation Protocol, and here in video. So this
functionality should live in lib/charset.c.

Please have a look at efi_fat_to_str() in
lib/efi_loader/efi_unicode_collation.c. We should avoid code duplication.

Maybe we should drop CP1250 support to make our effort simpler? No 
defconfig uses it.

> @@ -0,0 +1,169 @@
> +/*
> + * Convert UTF-8 bytes into a code page 437 character.
> + * Based on the table in the Code_page_437 Wikipedia page.
> + */
> +
> +#include <linux/types.h>
> +
> +uint8_t code_points_00a0[] = {
> +	255, 173, 155, 156,   7, 157,   7,  21,
> +	  7,   7, 166, 174, 170,   7,   7,   7,
> +	248, 241, 253,   7,   7, 230,  20, 250,
> +	  7,   7, 167, 175, 172, 171,   7, 168,
> +	  7,   7,   7,   7, 142, 143, 146, 128,
> +	  7, 144,   7,   7,   7,   7,   7,   7,
> +	  7, 165,   7,   7,   7,   7, 153,   7,
> +	  7,   7,   7,   7, 154,   7,   7, 225,
> +	133, 160, 131,   7, 132, 134, 145, 135,
> +	138, 130, 136, 137, 141, 161, 140, 139,
> +	  7, 164, 149, 162, 147,   7, 148, 246,
> +	  7, 151, 163, 150, 129,   7,   7, 152,
> +};
> +
> +uint8_t code_points_2550[] = {
> +	205, 186, 213, 214, 201, 184, 183, 187,
> +	212, 211, 200, 190, 189, 188, 198, 199,
> +	204, 181, 182, 185, 209, 210, 203, 207,
> +	208, 202, 216, 215, 206
> +};
> +
> +static uint8_t utf8_convert_11bit(uint16_t code)
> +{
> +	switch (code) {
> +	case 0x0192: return 159;
> +	case 0x0393: return 226;
> +	case 0x0398: return 233;
> +	case 0x03A3: return 228;
> +	case 0x03A6: return 232;
> +	case 0x03A9: return 234;
> +	case 0x03B1: return 224;
> +	case 0x03B4: return 235;
> +	case 0x03B5: return 238;
> +	case 0x03C0: return 227;
> +	case 0x03C3: return 229;
> +	case 0x03C4: return 231;
> +	case 0x03C6: return 237;
> +	}
> +
> +	return 0;
> +};
> +
> +static uint8_t utf8_convert_2xxx(uint16_t code)

This duplicates include/cp437.h.

> +{
> +	switch (code) {
> +	case 0x2022: return 7;
> +	case 0x203C: return 19;
> +	case 0x207F: return 252;
> +	case 0x20A7: return 158;
> +	case 0x2190: return 27;
> +	case 0x2191: return 24;
> +	case 0x2192: return 26;
> +	case 0x2193: return 25;
> +	case 0x2194: return 29;
> +	case 0x2195: return 18;
> +	case 0x21A8: return 23;
> +	case 0x2219: return 249;
> +	case 0x221A: return 251;
> +	case 0x221E: return 236;
> +	case 0x221F: return 28;
> +	case 0x2229: return 239;
> +	case 0x2248: return 247;
> +	case 0x2261: return 240;
> +	case 0x2264: return 243;
> +	case 0x2265: return 242;
> +	case 0x2310: return 169;
> +	case 0x2320: return 244;
> +	case 0x2321: return 245;
> +	case 0x2500: return 196;
> +	case 0x2502: return 179;
> +	case 0x250C: return 218;
> +	case 0x2510: return 191;
> +	case 0x2514: return 192;
> +	case 0x2518: return 217;
> +	case 0x251C: return 195;
> +	case 0x2524: return 180;
> +	case 0x252C: return 194;
> +	case 0x2534: return 193;
> +	case 0x253C: return 197;
> +	case 0x2580: return 223;
> +	case 0x2584: return 220;
> +	case 0x2588: return 219;
> +	case 0x258C: return 221;
> +	case 0x2590: return 222;
> +	case 0x2591: return 176;
> +	case 0x2592: return 177;
> +	case 0x2593: return 178;
> +	case 0x25A0: return 254;
> +	case 0x25AC: return 22;
> +	case 0x25B2: return 30;
> +	case 0x25BA: return 16;
> +	case 0x25BC: return 31;
> +	case 0x25C4: return 17;
> +	case 0x25CB: return 9;
> +	case 0x25D8: return 8;
> +	case 0x25D9: return 10;
> +	case 0x263A: return 1;
> +	case 0x263B: return 2;
> +	case 0x263C: return 15;
> +	case 0x2640: return 12;
> +	case 0x2642: return 11;
> +	case 0x2660: return 6;
> +	case 0x2663: return 5;
> +	case 0x2665: return 3;
> +	case 0x2666: return 4;
> +	case 0x266A: return 13;
> +	case 0x266B: return 14;
> +	}
> +
> +	return 0;
> +}
> +
> +uint8_t convert_uc16_to_cp437(uint16_t code)

We should not duplicate efi_fat_to_str() but use a common function.

> +{
> +	if (code < 0x7f)		// ASCII
> +		return code;
> +	if (code < 0xa0)		// high control characters
> +		return code;
> +	if (code < 0x100)		// international characters
> +		return code_points_00a0[code - 0xa0];
> +	if (code < 0x800)
> +		return utf8_convert_11bit(code);
> +	if (code >= 0x2550 && code < 0x256d)	// block graphics
> +		return code_points_2550[code - 0x2550];

How about ÄÖÜß and other European letters?

> +
> +	return utf8_convert_2xxx(code);
> +}
> +
> +uint8_t convert_utf8_to_cp437(uint8_t c, uint32_t *esc)
> +{
> +	int shift;
> +	uint16_t ucs;
> +
> +	if (c < 127)			// ASCII
> +		return c;
> +	if (c == 127)
> +		return 8;		// DEL (?)
> +
> +	switch (c & 0xf0) {
> +	case 0xc0: case 0xd0:		// two bytes sequence
> +		*esc = (1U << 24) | ((c & 0x1f) << 6);
> +		return 0;
> +	case 0xe0:			// three bytes sequence
> +		*esc = (2U << 24) | ((c & 0x0f) << 12);
> +		return 0;
> +	case 0xf0:			// four bytes sequence
> +		*esc = (3U << 24) | ((c & 0x07) << 18);
> +		return 0;
> +	case 0x80: case 0x90: case 0xa0: case 0xb0:	// continuation
> +		shift = (*esc >> 24) - 1;
> +		ucs = *esc & 0xffffff;
> +		if (shift) {
> +			*esc = (shift << 24) | ucs | (c & 0x3f) << (shift * 6);
> +			return 0;
> +		}
> +		*esc = 0;
> +		return convert_uc16_to_cp437(ucs | (c & 0x3f));
> +	}
> +
> +	return 0;
> +}
> diff --git a/drivers/video/vidconsole-uclass.c b/drivers/video/vidconsole-uclass.c
> index 420fd86f9ac..ca6e1a2620c 100644
> --- a/drivers/video/vidconsole-uclass.c
> +++ b/drivers/video/vidconsole-uclass.c
> @@ -546,6 +546,7 @@ static int vidconsole_output_glyph(struct udevice *dev, char ch)
>   int vidconsole_put_char(struct udevice *dev, char ch)
>   {
>   	struct vidconsole_priv *priv = dev_get_uclass_priv(dev);
> +	uint8_t cp437;
>   	int ret;
>   
>   	/*
> @@ -587,7 +588,10 @@ int vidconsole_put_char(struct udevice *dev, char ch)
>   		priv->last_ch = 0;
>   		break;
>   	default:
> -		ret = vidconsole_output_glyph(dev, ch);
> +		cp437 = convert_utf8_to_cp437(ch, &priv->ucs);
> +		if (cp437 == 0)
> +			return 0;
> +		ret = vidconsole_output_glyph(dev, cp437);
>   		if (ret < 0)
>   			return ret;
>   		break;
> diff --git a/include/video_console.h b/include/video_console.h

This should go to include/charset.h.

Best regards

Heinrich

> index a908f1412e8..f2d05e7f4e7 100644
> --- a/include/video_console.h
> +++ b/include/video_console.h
> @@ -83,6 +83,7 @@ struct vidconsole_priv {
>   	int escape_len;
>   	int row_saved;
>   	int col_saved;
> +	u32 ucs;
>   	bool cursor_visible;
>   	char escape_buf[32];
>   };
> @@ -304,6 +305,14 @@ static inline int vidconsole_memmove(struct udevice *dev, void *dst,
>   	return 0;
>   }
>   
> +/*
> + * Convert an UTF-8 byte into the corresponding character in the CP437
> + * code page. Returns 0 if that character is part of a multi-byte sequence.
> + * for which *esc holds the state of. Repeatedly feed in more bytes until
> + * the return value returns a non-0 character.

Please follow the style described in
https://www.kernel.org/doc/html/latest/doc-guide/kernel-doc.html#function-documentation.
This will allow inclusion in the HTML documentation (see
https://u-boot.readthedocs.io/en/latest/api/index.html).

Best regards

Heinrich

> + */
> +uint8_t convert_utf8_to_cp437(uint8_t c, uint32_t *esc);
> +
>   #endif
>   
>   #endif