[U-Boot] [PATCH 2/2] efi_loader: Optimize GOP more
Heinrich Schuchardt
xypron.glpk at gmx.de
Fri Mar 16 10:55:23 UTC 2018
On 03/15/2018 03:02 PM, Alexander Graf wrote:
> The GOP path was optimized, but still not as fast as it should be. Let's
> push it even further by trimming the hot path into simple 32bit load/store
> operations for buf->vid 32bpp operations.
>
> Signed-off-by: Alexander Graf <agraf at suse.de>
> ---
> lib/efi_loader/efi_gop.c | 176 ++++++++++++++++++++++++++++++-----------------
> 1 file changed, 114 insertions(+), 62 deletions(-)
>
> diff --git a/lib/efi_loader/efi_gop.c b/lib/efi_loader/efi_gop.c
> index bbdf34e1dd..7b76e49ab0 100644
> --- a/lib/efi_loader/efi_gop.c
> +++ b/lib/efi_loader/efi_gop.c
> @@ -78,18 +78,20 @@ static inline u16 efi_blt_col_to_vid16(struct efi_gop_pixel *blt)
> }
>
> static __always_inline efi_status_t gop_blt_int(struct efi_gop *this,
> - struct efi_gop_pixel *buffer,
> + struct efi_gop_pixel *bufferp,
> u32 operation, efi_uintn_t sx,
> efi_uintn_t sy, efi_uintn_t dx,
> efi_uintn_t dy,
> efi_uintn_t width,
> efi_uintn_t height,
> - efi_uintn_t delta)
> + efi_uintn_t delta,
> + efi_uintn_t vid_bpp)
> {
> struct efi_gop_obj *gopobj = container_of(this, struct efi_gop_obj, ops);
> - efi_uintn_t i, j, linelen;
> + efi_uintn_t i, j, linelen, slineoff = 0, dlineoff, swidth, dwidth;
> u32 *fb32 = gopobj->fb;
> u16 *fb16 = gopobj->fb;
> + struct efi_gop_pixel *buffer = __builtin_assume_aligned(bufferp, 4);
>
> if (delta) {
> /* Check for 4 byte alignment */
> @@ -133,6 +135,37 @@ static __always_inline efi_status_t gop_blt_int(struct efi_gop *this,
> break;
> }
>
> + /* Calculate line width */
> + switch (operation) {
> + case EFI_BLT_BUFFER_TO_VIDEO:
> + swidth = linelen;
> + break;
> + case EFI_BLT_VIDEO_TO_BLT_BUFFER:
> + case EFI_BLT_VIDEO_TO_VIDEO:
> + swidth = gopobj->info.width;
> + if (!vid_bpp)
> + return EFI_UNSUPPORTED;
> + break;
> + case EFI_BLT_VIDEO_FILL:
> + swidth = 0;
> + break;
> + }
> +
> + switch (operation) {
> + case EFI_BLT_BUFFER_TO_VIDEO:
> + case EFI_BLT_VIDEO_FILL:
> + case EFI_BLT_VIDEO_TO_VIDEO:
> + dwidth = gopobj->info.width;
> + if (!vid_bpp)
> + return EFI_UNSUPPORTED;
> + break;
> + case EFI_BLT_VIDEO_TO_BLT_BUFFER:
> + dwidth = linelen;
> + break;
> + }
> +
> + slineoff = swidth * sy;
> + dlineoff = dwidth * dy;
> for (i = 0; i < height; i++) {
> for (j = 0; j < width; j++) {
> struct efi_gop_pixel pix;
> @@ -143,70 +176,65 @@ static __always_inline efi_status_t gop_blt_int(struct efi_gop *this,
> pix = *buffer;
> break;
> case EFI_BLT_BUFFER_TO_VIDEO:
> - pix = buffer[linelen * (i + sy) + j + sx];
> + pix = buffer[slineoff + j + sx];
> break;
> case EFI_BLT_VIDEO_TO_BLT_BUFFER:
> case EFI_BLT_VIDEO_TO_VIDEO:
> - switch (gopobj->bpix) {
> -#ifdef CONFIG_DM_VIDEO
> - case VIDEO_BPP32:
> -#else
> - case LCD_COLOR32:
> -#endif
> + if (vid_bpp == 32)
> pix = *(struct efi_gop_pixel *)&fb32[
> - gopobj->info.width *
> - (i + sy) + j + sx];
> - break;
> -#ifdef CONFIG_DM_VIDEO
> - case VIDEO_BPP16:
> -#else
> - case LCD_COLOR16:
> -#endif
> + slineoff + j + sx];
> + else
> pix = efi_vid16_to_blt_col(fb16[
Shouldn't we eliminate this conversion for EFI_BLT_VIDEO_TO_VIDEO?
> - gopobj->info.width *
> - (i + sy) + j + sx]);
> - break;
> - default:
> - return EFI_UNSUPPORTED;
> - }
> + slineoff + j + sx]);
> break;
> }
>
> /* Write destination pixel */
> switch (operation) {
> case EFI_BLT_VIDEO_TO_BLT_BUFFER:
> - buffer[linelen * (i + dy) + j + dx] = pix;
> + does buffer[dlineoff + j + dx] = pix;
> break;
> case EFI_BLT_BUFFER_TO_VIDEO:
> case EFI_BLT_VIDEO_FILL:
> case EFI_BLT_VIDEO_TO_VIDEO:
> - switch (gopobj->bpix) {
> + if (vid_bpp == 32)
> + fb32[dlineoff + j + dx] = *(u32 *)&pix;
> + else
> + fb16[dlineoff + j + dx] =
> + efi_blt_col_to_vid16(&pix);
Same here.
The following problem is not introduced by your patch series so it
should not stop you from merging the patches:
For EFI_BLT_VIDEO_TO_VIDEO the spec does not define how to copy
overlapping rectangles. The iteration direction makes a big difference here.
I think we should do overlapping copies always in a way that keeps the
contents of the source rectangle. Currently we corrupt it when
sy == dy && sx < dx < sx + width || sy < dy < dy + height.
For dy > sy we should iterate bottom to top. For dx > sy && dy == sy we
should iterate right to left.
Best regards
Heinrich
> + break;
> + }
> + }
> + slineoff += swidth;
> + dlineoff += dwidth;
> + }
> +
> + return EFI_SUCCESS;
> +}
> +
> +static efi_uintn_t gop_get_bpp(struct efi_gop *this)
> +{
> + struct efi_gop_obj *gopobj = container_of(this, struct efi_gop_obj, ops);
> + efi_uintn_t vid_bpp = 0;
> +
> + switch (gopobj->bpix) {
> #ifdef CONFIG_DM_VIDEO
> - case VIDEO_BPP32:
> + case VIDEO_BPP32:
> #else
> - case LCD_COLOR32:
> + case LCD_COLOR32:
> #endif
> - fb32[gopobj->info.width *
> - (i + dy) + j + dx] = *(u32 *)&pix;
> - break;
> + vid_bpp = 32;
> + break;
> #ifdef CONFIG_DM_VIDEO
> - case VIDEO_BPP16:
> + case VIDEO_BPP16:
> #else
> - case LCD_COLOR16:
> + case LCD_COLOR16:
> #endif
> - fb16[gopobj->info.width *
> - (i + dy) + j + dx] =
> - efi_blt_col_to_vid16(&pix);
> - break;
> - default:
> - return EFI_UNSUPPORTED;
> - }
> - break;
> - }
> - }
> + vid_bpp = 16;
> + break;
> }
>
> - return EFI_SUCCESS;
> + return vid_bpp;
> }
>
> /*
> @@ -223,21 +251,33 @@ static efi_status_t gop_blt_video_fill(struct efi_gop *this,
> u32 foo, efi_uintn_t sx,
> efi_uintn_t sy, efi_uintn_t dx,
> efi_uintn_t dy, efi_uintn_t width,
> - efi_uintn_t height, efi_uintn_t delta)
> + efi_uintn_t height, efi_uintn_t delta,
> + efi_uintn_t vid_bpp)
> {
> return gop_blt_int(this, buffer, EFI_BLT_VIDEO_FILL, sx, sy, dx,
> - dy, width, height, delta);
> + dy, width, height, delta, vid_bpp);
> }
>
> -static efi_status_t gop_blt_buf_to_vid(struct efi_gop *this,
> - struct efi_gop_pixel *buffer,
> - u32 foo, efi_uintn_t sx,
> - efi_uintn_t sy, efi_uintn_t dx,
> - efi_uintn_t dy, efi_uintn_t width,
> - efi_uintn_t height, efi_uintn_t delta)
> +static efi_status_t gop_blt_buf_to_vid16(struct efi_gop *this,
> + struct efi_gop_pixel *buffer,
> + u32 foo, efi_uintn_t sx,
> + efi_uintn_t sy, efi_uintn_t dx,
> + efi_uintn_t dy, efi_uintn_t width,
> + efi_uintn_t height, efi_uintn_t delta)
> {
> return gop_blt_int(this, buffer, EFI_BLT_BUFFER_TO_VIDEO, sx, sy, dx,
> - dy, width, height, delta);
> + dy, width, height, delta, 16);
> +}
> +
> +static efi_status_t gop_blt_buf_to_vid32(struct efi_gop *this,
> + struct efi_gop_pixel *buffer,
> + u32 foo, efi_uintn_t sx,
> + efi_uintn_t sy, efi_uintn_t dx,
> + efi_uintn_t dy, efi_uintn_t width,
> + efi_uintn_t height, efi_uintn_t delta)
> +{
> + return gop_blt_int(this, buffer, EFI_BLT_BUFFER_TO_VIDEO, sx, sy, dx,
> + dy, width, height, delta, 32);
> }
>
> static efi_status_t gop_blt_vid_to_vid(struct efi_gop *this,
> @@ -245,10 +285,11 @@ static efi_status_t gop_blt_vid_to_vid(struct efi_gop *this,
> u32 foo, efi_uintn_t sx,
> efi_uintn_t sy, efi_uintn_t dx,
> efi_uintn_t dy, efi_uintn_t width,
> - efi_uintn_t height, efi_uintn_t delta)
> + efi_uintn_t height, efi_uintn_t delta,
> + efi_uintn_t vid_bpp)
> {
> return gop_blt_int(this, buffer, EFI_BLT_VIDEO_TO_VIDEO, sx, sy, dx,
> - dy, width, height, delta);
> + dy, width, height, delta, vid_bpp);
> }
>
> static efi_status_t gop_blt_vid_to_buf(struct efi_gop *this,
> @@ -256,10 +297,11 @@ static efi_status_t gop_blt_vid_to_buf(struct efi_gop *this,
> u32 foo, efi_uintn_t sx,
> efi_uintn_t sy, efi_uintn_t dx,
> efi_uintn_t dy, efi_uintn_t width,
> - efi_uintn_t height, efi_uintn_t delta)
> + efi_uintn_t height, efi_uintn_t delta,
> + efi_uintn_t vid_bpp)
> {
> return gop_blt_int(this, buffer, EFI_BLT_VIDEO_TO_BLT_BUFFER, sx, sy,
> - dx, dy, width, height, delta);
> + dx, dy, width, height, delta, vid_bpp);
> }
>
> /*
> @@ -287,27 +329,37 @@ efi_status_t EFIAPI gop_blt(struct efi_gop *this, struct efi_gop_pixel *buffer,
> efi_uintn_t height, efi_uintn_t delta)
> {
> efi_status_t ret = EFI_INVALID_PARAMETER;
> + efi_uintn_t vid_bpp;
>
> EFI_ENTRY("%p, %p, %u, %zu, %zu, %zu, %zu, %zu, %zu, %zu", this,
> buffer, operation, sx, sy, dx, dy, width, height, delta);
>
> + vid_bpp = gop_get_bpp(this);
> +
> /* Allow for compiler optimization */
> switch (operation) {
> case EFI_BLT_VIDEO_FILL:
> ret = gop_blt_video_fill(this, buffer, operation, sx, sy, dx,
> - dy, width, height, delta);
> + dy, width, height, delta, vid_bpp);
> break;
> case EFI_BLT_BUFFER_TO_VIDEO:
> - ret = gop_blt_buf_to_vid(this, buffer, operation, sx, sy, dx,
> - dy, width, height, delta);
> + /* This needs to be super-fast, so duplicate for 16/32bpp */
> + if (vid_bpp == 32)
> + ret = gop_blt_buf_to_vid32(this, buffer, operation, sx,
> + sy, dx, dy, width, height,
> + delta);
> + else
> + ret = gop_blt_buf_to_vid16(this, buffer, operation, sx,
> + sy, dx, dy, width, height,
> + delta);
> break;
> case EFI_BLT_VIDEO_TO_VIDEO:
> ret = gop_blt_vid_to_vid(this, buffer, operation, sx, sy, dx,
> - dy, width, height, delta);
> + dy, width, height, delta, vid_bpp);
> break;
> case EFI_BLT_VIDEO_TO_BLT_BUFFER:
> ret = gop_blt_vid_to_buf(this, buffer, operation, sx, sy, dx,
> - dy, width, height, delta);
> + dy, width, height, delta, vid_bpp);
> break;
> default:
> ret = EFI_UNSUPPORTED;
>
More information about the U-Boot
mailing list