diff --git a/src/common/api.c b/src/common/api.c index 9e370c9..70ad2c5 100644 --- a/src/common/api.c +++ b/src/common/api.c @@ -13,8 +13,14 @@ #include #include #include +#include #include + +#include "ion.h" +#include "ion-owl.h" +#include "de_atm7059.h" + #include "api.h" #include "utils.h" #include "defines.h" @@ -27,7 +33,6 @@ void LOG_note(int level, const char* fmt, ...) { va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - switch(level) { #ifdef DEBUG case LOG_DEBUG: @@ -51,6 +56,125 @@ void LOG_note(int level, const char* fmt, ...) { /////////////////////////////// +typedef struct ion_alloc_info { + uint32_t size; + struct ion_handle *handle; + int fd; + void* padd; + void* vadd; +} ion_alloc_info_t; +static void ion_alloc(int fd_ion, ion_alloc_info_t* info) { + struct ion_allocation_data iad; + struct ion_fd_data ifd; + struct ion_custom_data icd; + struct owl_ion_phys_data ipd; + + iad.len = info->size; + iad.align = sysconf(_SC_PAGESIZE); + iad.heap_id_mask = (1<handle = (void*)iad.handle; + info->fd = ifd.fd; + info->padd = (void*)ipd.phys_addr; + info->vadd = mmap(0, info->size, PROT_READ|PROT_WRITE, MAP_SHARED, info->fd, 0); + +} +static void ion_free(int fd_ion, ion_alloc_info_t* info) { + struct ion_handle_data ihd; + munmap(info->vadd, info->size); + ihd.handle = (uintptr_t)info->handle; + if (ioctl(fd_ion, ION_IOC_FREE, &ihd)<0) fprintf(stderr, "ION_FREE failed %s\n",strerror(errno)); + fflush(stdout); +} + +/////////////////////////////// + +#define DE (0xB02E0000) +#define DE_SIZE (0x00002000) +enum { + DE_SCOEF_NONE, + DE_SCOEF_CRISPY, + DE_SCOEF_ZOOMIN, + DE_SCOEF_HALF_ZOOMOUT, + DE_SCOEF_SMALLER_ZOOMOUT, + DE_SCOEF_MAX +}; +static void DE_setScaleCoef(uint32_t* de_mem, int plane, int scale) { + switch(scale) { + case DE_SCOEF_NONE: // for integer scale < L R > (0x40=100%) Applies to the following pixels: + de_mem[DE_OVL_SCOEF0(plane)/4]= 0x00400000; // L 100% R 0% + de_mem[DE_OVL_SCOEF1(plane)/4]= 0x00400000; // L 87.5% R 12.5% + de_mem[DE_OVL_SCOEF2(plane)/4]= 0x00400000; // L 75% R 25% + de_mem[DE_OVL_SCOEF3(plane)/4]= 0x00400000; // L 62.5% R 37.5% + de_mem[DE_OVL_SCOEF4(plane)/4]= 0x00004000; // L 50% R 50% + de_mem[DE_OVL_SCOEF5(plane)/4]= 0x00004000; // L 37.5% R 62.5% + de_mem[DE_OVL_SCOEF6(plane)/4]= 0x00004000; // L 25% R 75% + de_mem[DE_OVL_SCOEF7(plane)/4]= 0x00004000; // L 12.5% R 87.5% + break; + case DE_SCOEF_CRISPY: // crispy setting for upscale + de_mem[DE_OVL_SCOEF0(plane)/4]= 0x00400000; + de_mem[DE_OVL_SCOEF1(plane)/4]= 0x00400000; + de_mem[DE_OVL_SCOEF2(plane)/4]= 0x00400000; + de_mem[DE_OVL_SCOEF3(plane)/4]= 0x00400000; + de_mem[DE_OVL_SCOEF4(plane)/4]= 0x00202000; + de_mem[DE_OVL_SCOEF5(plane)/4]= 0x00004000; + de_mem[DE_OVL_SCOEF6(plane)/4]= 0x00004000; + de_mem[DE_OVL_SCOEF7(plane)/4]= 0x00004000; + break; + case DE_SCOEF_ZOOMIN: + de_mem[DE_OVL_SCOEF0(plane)/4]= 0x00400000; + de_mem[DE_OVL_SCOEF1(plane)/4]= 0xFC3E07FF; + de_mem[DE_OVL_SCOEF2(plane)/4]= 0xFA3810FE; + de_mem[DE_OVL_SCOEF3(plane)/4]= 0xF9301BFC; + de_mem[DE_OVL_SCOEF4(plane)/4]= 0xFA2626FA; + de_mem[DE_OVL_SCOEF5(plane)/4]= 0xFC1B30F9; + de_mem[DE_OVL_SCOEF6(plane)/4]= 0xFE1038FA; + de_mem[DE_OVL_SCOEF7(plane)/4]= 0xFF073EFC; + break; + case DE_SCOEF_HALF_ZOOMOUT: + de_mem[DE_OVL_SCOEF0(plane)/4]= 0x00400000; + de_mem[DE_OVL_SCOEF1(plane)/4]= 0x00380800; + de_mem[DE_OVL_SCOEF2(plane)/4]= 0x00301000; + de_mem[DE_OVL_SCOEF3(plane)/4]= 0x00281800; + de_mem[DE_OVL_SCOEF4(plane)/4]= 0x00202000; + de_mem[DE_OVL_SCOEF5(plane)/4]= 0x00182800; + 
de_mem[DE_OVL_SCOEF6(plane)/4]= 0x00103000; + de_mem[DE_OVL_SCOEF7(plane)/4]= 0x00083800; + break; + case DE_SCOEF_SMALLER_ZOOMOUT: + de_mem[DE_OVL_SCOEF0(plane)/4]= 0x10201000; + de_mem[DE_OVL_SCOEF1(plane)/4]= 0x0E1E1202; + de_mem[DE_OVL_SCOEF2(plane)/4]= 0x0C1C1404; + de_mem[DE_OVL_SCOEF3(plane)/4]= 0x0A1A1606; + de_mem[DE_OVL_SCOEF4(plane)/4]= 0x08181808; + de_mem[DE_OVL_SCOEF5(plane)/4]= 0x06161A0A; + de_mem[DE_OVL_SCOEF6(plane)/4]= 0x04141C0C; + de_mem[DE_OVL_SCOEF7(plane)/4]= 0x02121E0E; + break; + default: + break; + } +} +static void DE_enableLayer(uint32_t* de_mem) { + de_mem[DE_PATH_CTL(0)/4] = 0x30100000 | (de_mem[DE_PATH_CTL(0)/4] & 0xCF0FFFFF); +} +static void DE_setRect(uint32_t* de_mem, int x, int y, int w, int h) { + de_mem[(DE_OVL_OSIZE(0))/4] = ((w-1)&0xFFFF) | ((h-1)<<16); + de_mem[(DE_OVL_SR(0))/4] = ((0x2000*((de_mem[(DE_OVL_ISIZE(0))/4]&0xFFFF)+1)/w)&0xFFFF) | + ((0x2000*((de_mem[(DE_OVL_ISIZE(0))/4]>>16)+1)/h)<<16); + de_mem[(DE_OVL_COOR(0,0))/4] = (y<<16) | (x&0xFFFF); +} + +/////////////////////////////// + #define MAX_PRIVATE_DATA_SIZE 40 struct owlfb_disp_device{ __u32 mType; @@ -166,20 +290,26 @@ uint32_t RGB_GRAY; uint32_t RGB_DARK_GRAY; static struct GFX_Context { - int fb0_fd; - int page; - int resized; - - int mode; - int vsync; - - struct fb_fix_screeninfo finfo; - struct fb_var_screeninfo vinfo; - - void* fb0_buffer; - SDL_Surface* screen; SDL_Surface* assets; + + int mode; + int vsync; + + int fd_fb; + int fd_ion; + int fd_mem; + + uint32_t* de_mem; + + struct fb_fix_screeninfo finfo; + struct fb_var_screeninfo vinfo; + ion_alloc_info_t fb_info; + + int page; + int width; + int height; + int pitch; } gfx; static SDL_Rect asset_rects[] = { @@ -216,7 +346,6 @@ GFX_Fonts font; static struct POW_Context { int can_poweroff; - int previous_speed; // TODO: unused int can_autosleep; pthread_t battery_pt; @@ -233,51 +362,49 @@ static struct POW_Context { /////////////////////////////// static int _; + SDL_Surface* GFX_init(int mode) { SDL_Init(SDL_INIT_VIDEO); SDL_ShowCursor(0); - TTF_Init(); + SDL_SetVideoMode(0,0,FIXED_DEPTH,0); - ////////////////////////////// + //////// - SDL_SetVideoMode(FIXED_WIDTH, FIXED_HEIGHT, FIXED_DEPTH, SDL_SWSURFACE); + gfx.fd_fb = open("/dev/fb0", O_RDWR); + gfx.fd_ion = open("/dev/ion", O_RDWR); + gfx.fd_mem = open("/dev/mem", O_RDWR); + gfx.de_mem = mmap(0, DE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, gfx.fd_mem, DE); - gfx.fb0_fd = open("/dev/fb0", O_RDWR); + ioctl(gfx.fd_fb, FBIOGET_FSCREENINFO, &gfx.finfo); + ioctl(gfx.fd_fb, FBIOGET_VSCREENINFO, &gfx.vinfo); - ioctl(gfx.fb0_fd, FBIOGET_FSCREENINFO, &gfx.finfo); - ioctl(gfx.fb0_fd, FBIOGET_VSCREENINFO, &gfx.vinfo); - gfx.vinfo.bits_per_pixel = FIXED_DEPTH; - gfx.vinfo.xres = FIXED_WIDTH; - gfx.vinfo.yres = FIXED_HEIGHT; - gfx.vinfo.xres_virtual = VIRTUAL_WIDTH; - gfx.vinfo.yres_virtual = VIRTUAL_HEIGHT; - gfx.vinfo.xoffset = 0; - gfx.vinfo.yoffset = 0; - if (ioctl(gfx.fb0_fd, FBIOPUT_VSCREENINFO, &gfx.vinfo)) LOG_info("FBIOPUT_VSCREENINFO failed %s\n", strerror(errno)); + gfx.page = 1; + gfx.width = FIXED_WIDTH; + gfx.height = FIXED_HEIGHT; + gfx.pitch = FIXED_PITCH; - // printf("bits_per_pixel: %i\n", gfx.vinfo.bits_per_pixel); - // printf("xres: %i\n", gfx.vinfo.xres); - // printf("yres: %i\n", gfx.vinfo.yres); - // printf("xres_virtual: %i\n", gfx.vinfo.xres_virtual); - // printf("yres_virtual: %i\n", gfx.vinfo.yres_virtual); - // printf("xoffset: %i\n", gfx.vinfo.xoffset); - // printf("yoffset: %i\n", gfx.vinfo.yoffset); - // printf("activate: %i\n", gfx.vinfo.activate); - // 
printf("vmode: %i\n", gfx.vinfo.vmode); - // printf("sync: %i\n", gfx.vinfo.sync); - // fflush(stdout); + gfx.fb_info.size = PAGE_SIZE * PAGE_COUNT; + ion_alloc(gfx.fd_ion, &gfx.fb_info); + + gfx.screen = SDL_CreateRGBSurfaceFrom(gfx.fb_info.vadd+PAGE_SIZE, gfx.width,gfx.height,FIXED_DEPTH,gfx.pitch, RGBA_MASK_AUTO); + memset(gfx.screen->pixels, 0, gfx.pitch * gfx.height); struct owlfb_sync_info sinfo; sinfo.enabled = 1; - if (ioctl(gfx.fb0_fd, OWLFB_VSYNC_EVENT_EN, &sinfo)) LOG_info("OWLFB_VSYNC_EVENT_EN failed %s\n", strerror(errno)); + sinfo.disp_id = 2; + if (ioctl(gfx.fd_fb, OWLFB_VSYNC_EVENT_EN, &sinfo)<0) fprintf(stderr, "VSYNC_EVENT_EN failed %s\n",strerror(errno)); - gfx.page = 1; // start on the backbuffer - gfx.fb0_buffer = mmap(0, gfx.finfo.smem_len, PROT_READ | PROT_WRITE, MAP_SHARED, gfx.fb0_fd, 0); - memset(gfx.fb0_buffer, 0, VIRTUAL_SIZE); // clear both buffers + int vw = (gfx.de_mem[DE_PATH_SIZE(0)/4]&0xFFFF)+1; + int vh = (gfx.de_mem[DE_PATH_SIZE(0)/4]>>16)+1; - gfx.screen = SDL_CreateRGBSurfaceFrom(gfx.fb0_buffer + (gfx.page * PAGE_SIZE), FIXED_WIDTH,FIXED_HEIGHT, FIXED_DEPTH,FIXED_PITCH, 0,0,0,0); + gfx.de_mem[DE_OVL_ISIZE(0)/4] = gfx.de_mem[DE_OVL_ISIZE(2)/4] = ((gfx.width-1) & 0xFFFF) | ((gfx.height-1) << 16); + gfx.de_mem[DE_OVL_SR(0)/4] = gfx.de_mem[DE_OVL_SR(2)/4] = ((0x2000*gfx.width/vw)&0xFFFF) | ((0x2000*gfx.height/vh)<<16); + gfx.de_mem[DE_OVL_STR(0)/4] = gfx.de_mem[DE_OVL_STR(2)/4] = gfx.pitch / 8; + gfx.de_mem[DE_OVL_BA0(0)/4] = (uintptr_t)(gfx.fb_info.padd + PAGE_SIZE); - ////////////////////////////// + GFX_setNearestNeighbor(0); + + //////// gfx.vsync = VSYNC_STRICT; gfx.mode = mode; @@ -306,6 +433,7 @@ SDL_Surface* GFX_init(int mode) { sprintf(asset_path, RES_PATH "/assets@%ix.png", SCREEN_SCALE); gfx.assets = IMG_Load(asset_path); + TTF_Init(); font.large = TTF_OpenFont(FONT_PATH, SCALE1(FONT_LARGE)); font.medium = TTF_OpenFont(FONT_PATH, SCALE1(FONT_MEDIUM)); font.small = TTF_OpenFont(FONT_PATH, SCALE1(FONT_SMALL)); @@ -322,33 +450,24 @@ void GFX_quit(void) { SDL_FreeSurface(gfx.assets); - ioctl(gfx.fb0_fd, OWLFB_WAITFORVSYNC, &_); - GFX_clearAll(); - munmap(gfx.fb0_buffer, VIRTUAL_SIZE); - // restore for other binaries - gfx.vinfo.bits_per_pixel = FIXED_DEPTH; - gfx.vinfo.xres = FIXED_WIDTH; - gfx.vinfo.yres = FIXED_HEIGHT; - gfx.vinfo.xres_virtual = FIXED_WIDTH; - gfx.vinfo.yres_virtual = FIXED_HEIGHT; - gfx.vinfo.xoffset = 0; - gfx.vinfo.yoffset = 0; - if (ioctl(gfx.fb0_fd, FBIOPUT_VSCREENINFO, &gfx.vinfo)) LOG_info("FBIOPUT_VSCREENINFO failed %s\n", strerror(errno)); - - close(gfx.fb0_fd); + ion_free(gfx.fd_ion, &gfx.fb_info); + munmap(gfx.de_mem, DE_SIZE); + close(gfx.fd_mem); + close(gfx.fd_ion); + close(gfx.fd_fb); + SDL_FreeSurface(gfx.screen); SDL_Quit(); } void GFX_clear(SDL_Surface* screen) { - memset(screen->pixels, 0, PAGE_SIZE); // this buffer is offscreen when cleared + // this buffer is offscreen when cleared + memset(screen->pixels, 0, PAGE_SIZE); } void GFX_clearAll(void) { - // TODO: one of the buffers is onscreen when cleared producing tearing - // so clear our working buffer immediately (screen->pixels) - // then set a flag and clear the other two after vsync? 
- memset(gfx.fb0_buffer, 0, VIRTUAL_SIZE); + // TODO: one buffer is onscreen when cleared producing tearing + memset(gfx.fb_info.vadd, 0, PAGE_SIZE * PAGE_COUNT); } void GFX_setMode(int mode) { @@ -367,62 +486,72 @@ void GFX_startFrame(void) { frame_start = SDL_GetTicks(); } SDL_Surface* GFX_resize(int w, int h, int pitch) { - LOG_info("resize: %ix%i (%i)\n", w,h, pitch); - // callee should decide if resizing is actually necessary + gfx.width = w; + gfx.height = h; + gfx.pitch = pitch; + + SDL_FreeSurface(gfx.screen); + gfx.screen = SDL_CreateRGBSurfaceFrom(gfx.fb_info.vadd + gfx.page*PAGE_SIZE, gfx.width,gfx.height,FIXED_DEPTH,gfx.pitch, RGBA_MASK_AUTO); + memset(gfx.screen->pixels, 0, gfx.pitch * gfx.height); - if (gfx.screen) SDL_FreeSurface(gfx.screen); + int vw = (gfx.de_mem[DE_PATH_SIZE(0)/4]&0xFFFF)+1; + int vh = (gfx.de_mem[DE_PATH_SIZE(0)/4]>>16)+1; - gfx.screen = SDL_CreateRGBSurfaceFrom(gfx.fb0_buffer + (gfx.page * PAGE_SIZE), w,h, FIXED_DEPTH,pitch, 0,0,0,0); - memset(gfx.screen->pixels, 0, PAGE_SIZE); - - gfx.vinfo.xres = w; - gfx.vinfo.yres = h; - - // triggers FBIOPUT_VSCREENINFO instead - // of FBIOPAN_DISPLAY in GFX_flip() - gfx.resized = 1; + gfx.de_mem[DE_OVL_ISIZE(0)/4] = gfx.de_mem[DE_OVL_ISIZE(2)/4] = ((gfx.width-1) & 0xFFFF) | ((gfx.height-1) << 16); + gfx.de_mem[DE_OVL_SR(0)/4] = gfx.de_mem[DE_OVL_SR(2)/4] = ((0x2000*gfx.width/vw)&0xFFFF) | ((0x2000*gfx.height/vh)<<16); + gfx.de_mem[DE_OVL_STR(0)/4] = gfx.de_mem[DE_OVL_STR(2)/4] = gfx.pitch / 8; + gfx.de_mem[DE_OVL_BA0(0)/4] = (uintptr_t)(gfx.fb_info.padd + gfx.page * PAGE_SIZE); return gfx.screen; } +void GFX_setScaleClip(int x, int y, int width, int height) { + DE_setRect(gfx.de_mem, x,y,width,height); +} +void GFX_setNearestNeighbor(int enabled) { + int scale_coef = enabled ? DE_SCOEF_NONE : DE_SCOEF_HALF_ZOOMOUT; + DE_setScaleCoef(gfx.de_mem, 0, scale_coef); + DE_setScaleCoef(gfx.de_mem, 1, scale_coef); + DE_setScaleCoef(gfx.de_mem, 2, scale_coef); + DE_setScaleCoef(gfx.de_mem, 3, scale_coef); +} int GFX_autosize(SDL_Surface** screen, int* dirty) { - static int had_hdmi = -1; - int has_hdmi = GetHDMI(); - if (had_hdmi==has_hdmi) return 0; + // TODO: remove this entirely? + return 0; - *dirty = 1; - if (has_hdmi) *screen = GFX_resize(HDMI_MENU_WIDTH,FIXED_HEIGHT,HDMI_MENU_WIDTH*FIXED_BPP); - else *screen = GFX_resize(FIXED_WIDTH,FIXED_HEIGHT,FIXED_PITCH); - had_hdmi = has_hdmi; - - return 1; + // static int had_hdmi = -1; + // int has_hdmi = GetHDMI(); + // if (had_hdmi==has_hdmi) return 0; + // + // *dirty = 1; + // if (has_hdmi) *screen = GFX_resize(HDMI_MENU_WIDTH,FIXED_HEIGHT,HDMI_MENU_WIDTH*FIXED_BPP); + // else *screen = GFX_resize(FIXED_WIDTH,FIXED_HEIGHT,FIXED_PITCH); + // had_hdmi = has_hdmi; + // + // return 1; } static void POW_flipOverlay(void); void GFX_flip(SDL_Surface* screen) { - // point framebuffer at the first line of the backbuffer - gfx.vinfo.yoffset = gfx.page * PAGE_HEIGHT; - if (ioctl(gfx.fb0_fd, gfx.resized ? FBIOPUT_VSCREENINFO : FBIOPAN_DISPLAY, &gfx.vinfo)) LOG_info("%s failed %s\n", (gfx.resized ? 
"FBIOPUT_VSCREENINFO" : "FBIOPAN_DISPLAY"), strerror(errno)); - gfx.resized = 0; + gfx.de_mem[DE_OVL_BA0(0)/4] = gfx.de_mem[DE_OVL_BA0(2)/4] = (uintptr_t)(gfx.fb_info.padd + gfx.page * PAGE_SIZE); + DE_enableLayer(gfx.de_mem); POW_flipOverlay(); if (gfx.vsync!=VSYNC_OFF) { // this limiting condition helps SuperFX chip games if (gfx.vsync==VSYNC_STRICT || frame_start==0 || SDL_GetTicks()-frame_start=PAGE_COUNT) gfx.page -= PAGE_COUNT; - gfx.screen->pixels = gfx.fb0_buffer + (gfx.page * PAGE_SIZE); - + gfx.page ^= 1; + gfx.screen->pixels = gfx.fb_info.vadd + gfx.page * PAGE_SIZE; } void GFX_sync(void) { if (gfx.vsync!=VSYNC_OFF) { // this limiting condition helps SuperFX chip games if (gfx.vsync==VSYNC_STRICT || frame_start==0 || SDL_GetTicks()-frame_startw,gfx.screen->h,FIXED_DEPTH,0,0,0,0); - SDL_BlitSurface(gfx.screen, NULL, copy, NULL); + SDL_BlitSurface(gfx.screen, NULL, copy, NULL); // TODO: this is just copying screen! :facepalm: return copy; } int GFX_truncateText(TTF_Font* font, const char* in_name, char* out_name, int max_width, int padding) { @@ -454,8 +583,6 @@ int GFX_truncateText(TTF_Font* font, const char* in_name, char* out_name, int ma return text_width; } int GFX_wrapText(TTF_Font* font, char* str, int max_width, int max_lines) { - // TODO: this is "kinda" a buggy mess...but possibly fixed now! - if (!str) return 0; int line_width; @@ -1224,62 +1351,62 @@ int VIB_getStrength(void) { #define OVERLAY_FB 0 #define OVERLAY_ID 1 static void POW_initOverlay(void) { - // setup surface - pow.overlay = SDL_CreateRGBSurfaceFrom(NULL,SCALE2(OVERLAY_WIDTH,OVERLAY_HEIGHT),OVERLAY_DEPTH,SCALE1(OVERLAY_PITCH), OVERLAY_RGBA_MASK); - uint32_t size = pow.overlay->h * pow.overlay->pitch; - uint32_t offset = (gfx.finfo.smem_len - size)&(~4095); - pow.overlay->pixels = gfx.fb0_buffer + offset; - - // draw battery - SDL_SetAlpha(gfx.assets, 0,0); - GFX_blitAsset(ASSET_BLACK_PILL, NULL, pow.overlay, NULL); - SDL_SetAlpha(gfx.assets, SDL_SRCALPHA,0); - GFX_blitBattery(pow.overlay, NULL); - - // setup overlay - memset(&pow.oargs, 0, sizeof(struct owlfb_overlay_args)); - pow.oargs.fb_id = OVERLAY_FB; - pow.oargs.overlay_id = OVERLAY_ID; - pow.oargs.overlay_type = OWLFB_OVERLAY_VIDEO; - pow.oargs.uintptr_overly_info = (uintptr_t)&pow.oinfo; - - int x,y,w,h; - w = h = pow.overlay->w; - x = SCREEN_WIDTH - SCALE1(PADDING) - w; - y = SCALE1(PADDING); - - pow.oinfo.mem_off = offset; - pow.oinfo.mem_size = size; - pow.oinfo.screen_width = VIRTUAL_WIDTH; // ??? 
- pow.oinfo.color_mode = OWL_DSS_COLOR_ARGB32; - pow.oinfo.img_width = w; - pow.oinfo.img_height = h; - pow.oinfo.xoff = 0; - pow.oinfo.yoff = 0; - pow.oinfo.width = w; - pow.oinfo.height = h; - pow.oinfo.rotation = 0; - pow.oinfo.pos_x = x; // position - pow.oinfo.pos_y = y; // - pow.oinfo.out_width = w; // scaled size - pow.oinfo.out_height = h; // - pow.oinfo.global_alpha_en = 0; - pow.oinfo.global_alpha = 0; - pow.oinfo.pre_mult_alpha_en = 0; - pow.oinfo.zorder = 3; + // // setup surface + // pow.overlay = SDL_CreateRGBSurfaceFrom(NULL,SCALE2(OVERLAY_WIDTH,OVERLAY_HEIGHT),OVERLAY_DEPTH,SCALE1(OVERLAY_PITCH), OVERLAY_RGBA_MASK); + // uint32_t size = pow.overlay->h * pow.overlay->pitch; + // uint32_t offset = (gfx.finfo.smem_len - size)&(~4095); + // pow.overlay->pixels = gfx.fb0_buffer + offset; + // + // // draw battery + // SDL_SetAlpha(gfx.assets, 0,0); + // GFX_blitAsset(ASSET_BLACK_PILL, NULL, pow.overlay, NULL); + // SDL_SetAlpha(gfx.assets, SDL_SRCALPHA,0); + // GFX_blitBattery(pow.overlay, NULL); + // + // // setup overlay + // memset(&pow.oargs, 0, sizeof(struct owlfb_overlay_args)); + // pow.oargs.fb_id = OVERLAY_FB; + // pow.oargs.overlay_id = OVERLAY_ID; + // pow.oargs.overlay_type = OWLFB_OVERLAY_VIDEO; + // pow.oargs.uintptr_overly_info = (uintptr_t)&pow.oinfo; + // + // int x,y,w,h; + // w = h = pow.overlay->w; + // x = SCREEN_WIDTH - SCALE1(PADDING) - w; + // y = SCALE1(PADDING); + // + // pow.oinfo.mem_off = offset; + // pow.oinfo.mem_size = size; + // pow.oinfo.screen_width = VIRTUAL_WIDTH; // ??? + // pow.oinfo.color_mode = OWL_DSS_COLOR_ARGB32; + // pow.oinfo.img_width = w; + // pow.oinfo.img_height = h; + // pow.oinfo.xoff = 0; + // pow.oinfo.yoff = 0; + // pow.oinfo.width = w; + // pow.oinfo.height = h; + // pow.oinfo.rotation = 0; + // pow.oinfo.pos_x = x; // position + // pow.oinfo.pos_y = y; // + // pow.oinfo.out_width = w; // scaled size + // pow.oinfo.out_height = h; // + // pow.oinfo.global_alpha_en = 0; + // pow.oinfo.global_alpha = 0; + // pow.oinfo.pre_mult_alpha_en = 0; + // pow.oinfo.zorder = 3; } static void POW_flipOverlay(void) { - if (pow.should_warn && pow.charge<=POW_LOW_CHARGE) ioctl(gfx.fb0_fd, OWLFB_OVERLAY_SETINFO, &pow.oargs); + // if (pow.should_warn && pow.charge<=POW_LOW_CHARGE) ioctl(gfx.fb0_fd, OWLFB_OVERLAY_SETINFO, &pow.oargs); } static void POW_quitOverlay(void) { - if (pow.overlay) SDL_FreeSurface(pow.overlay); - - memset(&pow.oargs, 0, sizeof(struct owlfb_overlay_args)); - pow.oargs.fb_id = OVERLAY_FB; - pow.oargs.overlay_id = OVERLAY_ID; - pow.oargs.overlay_type = OWLFB_OVERLAY_VIDEO; - pow.oargs.uintptr_overly_info = 0; - ioctl(gfx.fb0_fd, OWLFB_OVERLAY_DISABLE, &pow.oargs); + // if (pow.overlay) SDL_FreeSurface(pow.overlay); + // + // memset(&pow.oargs, 0, sizeof(struct owlfb_overlay_args)); + // pow.oargs.fb_id = OVERLAY_FB; + // pow.oargs.overlay_id = OVERLAY_ID; + // pow.oargs.overlay_type = OWLFB_OVERLAY_VIDEO; + // pow.oargs.uintptr_overly_info = 0; + // ioctl(gfx.fb0_fd, OWLFB_OVERLAY_DISABLE, &pow.oargs); } static void POW_updateBatteryStatus(void) { diff --git a/src/common/api.h b/src/common/api.h index 3cb629e..77e38a3 100644 --- a/src/common/api.h +++ b/src/common/api.h @@ -20,12 +20,12 @@ void LOG_note(int level, const char* fmt, ...); /////////////////////////////// -#define FIXED_WIDTH 640 -#define FIXED_HEIGHT 480 -#define FIXED_BPP 2 -#define FIXED_DEPTH FIXED_BPP * 8 -#define FIXED_PITCH FIXED_WIDTH * FIXED_BPP -#define FIXED_SIZE FIXED_HEIGHT * FIXED_PITCH +#define FIXED_WIDTH 640 +#define FIXED_HEIGHT 480 
+#define FIXED_BPP 2 +#define FIXED_DEPTH (FIXED_BPP * 8) +#define FIXED_PITCH (FIXED_WIDTH * FIXED_BPP) +#define FIXED_SIZE (FIXED_PITCH * FIXED_HEIGHT) #define HDMI_WIDTH 1280 #define HDMI_HEIGHT 720 @@ -33,17 +33,18 @@ void LOG_note(int level, const char* fmt, ...); #define HDMI_SIZE HDMI_HEIGHT * HDMI_PITCH #define HDMI_MENU_WIDTH 856 // FIXED_WIDTH * FIXED_HEIGHT / HDMI_HEIGHT rounded up to nearest 8 -#define PAGE_COUNT 2 -#define PAGE_SCALE 2 -#define PAGE_WIDTH FIXED_WIDTH * PAGE_SCALE -#define PAGE_HEIGHT FIXED_HEIGHT * PAGE_SCALE -#define PAGE_PITCH PAGE_WIDTH * FIXED_BPP -#define PAGE_SIZE PAGE_HEIGHT * PAGE_PITCH +#define PAGE_COUNT 2 +#define PAGE_SCALE 3 +#define PAGE_WIDTH (FIXED_WIDTH * PAGE_SCALE) +#define PAGE_HEIGHT (FIXED_HEIGHT * PAGE_SCALE) +#define PAGE_PITCH (PAGE_WIDTH * FIXED_BPP) +#define PAGE_SIZE (PAGE_PITCH * PAGE_HEIGHT) -#define VIRTUAL_WIDTH PAGE_WIDTH -#define VIRTUAL_HEIGHT PAGE_HEIGHT * PAGE_COUNT -#define VIRTUAL_PITCH PAGE_WIDTH * FIXED_BPP -#define VIRTUAL_SIZE VIRTUAL_HEIGHT * VIRTUAL_PITCH +/////////////////////////////// + +#define RGBA_MASK_AUTO 0x0, 0x0, 0x0, 0x0 +#define RGBA_MASK_565 0xF800, 0x07E0, 0x001F, 0x0000 +#define RGBA_MASK_8888 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000 /////////////////////////////// @@ -98,6 +99,8 @@ enum { SDL_Surface* GFX_init(int mode); SDL_Surface* GFX_resize(int width, int height, int pitch); +void GFX_setScaleClip(int x, int y, int width, int height); +void GFX_setNearestNeighbor(int enabled); int GFX_autosize(SDL_Surface** screen, int* dirty); void GFX_setMode(int mode); void GFX_clear(SDL_Surface* screen); diff --git a/src/common/de_atm7059.h b/src/common/de_atm7059.h new file mode 100755 index 0000000..ca06ab5 --- /dev/null +++ b/src/common/de_atm7059.h @@ -0,0 +1,154 @@ +/* + * linux/drivers/video/owl/dss/de_atm7059.h + * + * NOTE: SHOULD only be included by de_atm7059.c + * + * Copyright (C) 2014 Actions + * Author: lipeng + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ + +#ifndef _DE_ATM7059_H_ +#define _DE_ATM7059_H_ + +//#include "de.h" + +/*================================================================ + * Definition of registers and bit position + *==============================================================*/ + + +/* please fixme */ +#define DE_SIZE_BIT_WIDTH 12 + +/* + * DE common registers + */ +#define DE_IRQSTATUS 0x0004 +#define DE_IRQENABLE 0x0000 +#define DE_IF_CON 0x000c + +#define DE_MMU_EN 0x0010 +#define DE_MMU_BASE 0x0014 + +#define DE_OUTPUT_CON 0x1000 +#define DE_OUTPUT_STAT 0x100c +#define DE_WB_CON 0x1004 +#define DE_WB_ADDR 0x1008 + +/* dither for path0, only for ATM7059A */ +#define DE_PATH_DITHER 0x150 + +/* + * dehw manager/channel specific registers + */ +#define DE_PATH_BASE 0x0100 +#define DE_PATH_CTL(n) (DE_PATH_BASE + (n) * 0x100 + 0x0000) + +#define DE_PATH_EN(n) DE_PATH_CTL(n) +#define DE_PATH_ENABLE_BIT 28 + +#define DE_PATH_FCR(n) DE_PATH_CTL(n) +#define DE_PATH_BK(n) (DE_PATH_BASE + (n) * 0x100 + 0x0020) +#define DE_PATH_SIZE(n) (DE_PATH_BASE + (n) * 0x100 + 0x0024) +#define DE_PATH_E_COOR(n) (DE_PATH_BASE + (n) * 0x100 + 0x0028) + +#define DE_PATH_GAMMA_IDX(n) (DE_PATH_BASE + (n) * 0x100 + 0x002C) +#define DE_PATH_GAMMA_IDX_BUSY_BIT (14) +#define DE_PATH_GAMMA_IDX_OP_SEL_BEGIN_BIT (12) +#define DE_PATH_GAMMA_IDX_OP_SEL_END_BIT (13) +#define DE_PATH_GAMMA_IDX_INDEX_BEGIN_BIT (0) +#define DE_PATH_GAMMA_IDX_INDEX_END_BIT (7) + +#define DE_PATH_GAMMA_RAM(n) (DE_PATH_BASE + (n) * 0x100 + 0x0030) + +#define DE_PATH_CURSOR_FB(n) (DE_PATH_BASE + (n) * 0x100 + 0x0034) +#define DE_PATH_CURSOR_STR(n) (DE_PATH_BASE + (n) * 0x100 + 0x0038) + +/* DE overlay registers */ +#define DE_OVL_BASE 0x0400 +#define DE_OVL_CFG(n) (DE_OVL_BASE + (n) * 0x100 + 0x0000) +#define DE_OVL_ISIZE(n) (DE_OVL_BASE + (n) * 0x100 + 0x0004) +#define DE_OVL_OSIZE(n) (DE_OVL_BASE + (n) * 0x100 + 0x0008) +#define DE_OVL_SR(n) (DE_OVL_BASE + (n) * 0x100 + 0x000c) +#define DE_OVL_SCOEF0(n) (DE_OVL_BASE + (n) * 0x100 + 0x0010) +#define DE_OVL_SCOEF1(n) (DE_OVL_BASE + (n) * 0x100 + 0x0014) +#define DE_OVL_SCOEF2(n) (DE_OVL_BASE + (n) * 0x100 + 0x0018) +#define DE_OVL_SCOEF3(n) (DE_OVL_BASE + (n) * 0x100 + 0x001c) +#define DE_OVL_SCOEF4(n) (DE_OVL_BASE + (n) * 0x100 + 0x0020) +#define DE_OVL_SCOEF5(n) (DE_OVL_BASE + (n) * 0x100 + 0x0024) +#define DE_OVL_SCOEF6(n) (DE_OVL_BASE + (n) * 0x100 + 0x0028) +#define DE_OVL_SCOEF7(n) (DE_OVL_BASE + (n) * 0x100 + 0x002c) +#define DE_OVL_BA0(n) (DE_OVL_BASE + (n) * 0x100 + 0x0030) +#define DE_OVL_BA1UV(n) (DE_OVL_BASE + (n) * 0x100 + 0x0034) +#define DE_OVL_BA2V(n) (DE_OVL_BASE + (n) * 0x100 + 0x0038) +#define DE_OVL_3D_RIGHT_BA0(n) (DE_OVL_BASE + (n) * 0x100 + 0x003C) +#define DE_OVL_3D_RIGHT_BA1UV(n) (DE_OVL_BASE + (n) * 0x100 + 0x0040) +#define DE_OVL_3D_RIGHT_BA2V(n) (DE_OVL_BASE + (n) * 0x100 + 0x0044) +#define DE_OVL_STR(n) (DE_OVL_BASE + (n) * 0x100 + 0x0048) +#define DE_OVL_CRITICAL_CFG(n) (DE_OVL_BASE + (n) * 0x100 + 0x004c) +#define DE_OVL_REMAPPING(n) (DE_OVL_BASE + (n) * 0x100 + 0x0050) +#define DE_OVL_CKMAX(n) (DE_OVL_BASE + (n) * 0x100 + 0x005c) +#define DE_OVL_CKMIN(n) (DE_OVL_BASE + (n) * 0x100 + 0x0060) +#define DE_OVL_BLEND(n) (DE_OVL_BASE + (n) * 0x100 + 0x0064) + +#define DE_OVL_COOR(m, n) (DE_OVL_BASE + (n) * 0x100 + 0x0054) +#define DE_OVL_ALPHA_CFG(m, n) (DE_OVL_BASE + (n) * 0x100 + 0x0058) +#define DE_OVL_ALPHA_ENABLE(m, n) DE_OVL_BLEND(n) +#define DE_PATH_GAMMA_ENABLE(n) DE_PATH_CTL(n) +#define DE_PATH_GAMMA_ENABLE_BIT (9) + +#define DE_OVL_CSC(n) DE_OVL_CFG(n) +#define 
DE_OVL_CSC_CON_BEGIN_BIT 4 +#define DE_OVL_CSC_CON_END_BIT 7 +#define DE_OVL_CSC_STA_BEGIN_BIT 8 +#define DE_OVL_CSC_STA_END_BIT 11 +#define DE_OVL_CSC_BRI_BEGIN_BIT 12 +#define DE_OVL_CSC_BRI_END_BIT 19 +#define DE_OVL_CSC_BYPASS_BIT 0 + +#define DE_OVL_CFG_FLIP_BIT 20 +#define DE_OVL_CFG_FMT_BEGIN_BIT 0 +#define DE_OVL_CFG_FMT_END_BIT 2 +#define DE_OVL_CFG_BYPASS_BIT 3 +#define DE_OVL_CFG_CONTRAST_BEGIN_BIT 4 +#define DE_OVL_CFG_CONTRAST_END_BIT 7 +#define DE_OVL_CFG_SATURATION_BEGIN_BIT 8 +#define DE_OVL_CFG_SATURATION_END_BIT 11 +#define DE_OVL_CFG_LIGHTNESS_BEGIN_BIT 12 +#define DE_OVL_CFG_LIGHTNESS_END_BIT 19 +#define DE_OVL_CFG_CRITICAL_CTL_BEGIN_BIT 26 +#define DE_OVL_CFG_CRITICAL_CTL_END_BIT 27 + +#define DE_OVL_ALPHA_CFG_PRE_MUTI_BIT 8 +#define DE_OVL_ALPHA_CFG_VALUE_BEGIN_BIT 0 +#define DE_OVL_ALPHA_CFG_VALUE_END_BIT 7 +#define DE_OVL_ALPHA_CFG_ENABLE_BEGIN_BIT 0 +#define DE_OVL_ALPHA_CFG_ENABLE_END_BIT 0 + +#define DE_OUTPUT_PATH1_DEVICE_BEGIN_BIT 0 +#define DE_OUTPUT_PATH1_DEVICE_END_BIT 2 +#define DE_OUTPUT_PATH2_DEVICE_BEGIN_BIT 4 +#define DE_OUTPUT_PATH2_DEVICE_END_BIT 6 + +#define DE_PATH_CTL_IYUV_QEN_BIT 16 +#define DE_PATH_CTL_YUV_FMT_BIT 15 +#define DE_PATH_CTL_ILACE_BIT 11 +#define DE_PATH_CTL_GAMMA_ENABLE_BIT 9 + +#define DE_PANEL_ENABLE_BIT 20 +#define DE_PANEL_CURSOR_ENABLE_BIT 24 +#define DE_PATH_FCR_BIT 29 + +#endif diff --git a/src/common/defines.h b/src/common/defines.h index f86d8f4..072adc1 100644 --- a/src/common/defines.h +++ b/src/common/defines.h @@ -20,6 +20,25 @@ #define CODE_MINUS 0x6D #define CODE_POWER 0x74 +#define BUTTON_UP SDLK_KATAKANA +#define BUTTON_RIGHT SDLK_KATAKANAHIRAGANA +#define BUTTON_DOWN SDLK_HIRAGANA +#define BUTTON_LEFT SDLK_HENKAN +#define BUTTON_A SDLK_MUHENKAN +#define BUTTON_B SDLK_KP_JPCOMMA +#define BUTTON_X SDLK_KP_ENTER +#define BUTTON_Y SDLK_RCTRL +#define BUTTON_L1 SDLK_RALT +#define BUTTON_L2 SDLK_HOME +#define BUTTON_R1 SDLK_BREAK +#define BUTTON_R2 SDLK_UP +#define BUTTON_SELECT SDLK_PRINT +#define BUTTON_START SDLK_KP_DIVIDE +#define BUTTON_MENU SDLK_PAGEUP +#define BUTTON_PLUS SDLK_DOWN +#define BUTTON_MINUS SDLK_PAGEDOWN +#define BUTTON_POWER SDLK_UNKNOWN + #define VOLUME_MIN 0 #define VOLUME_MAX 20 #define BRIGHTNESS_MIN 0 diff --git a/src/common/ion-owl.h b/src/common/ion-owl.h new file mode 100755 index 0000000..cbe732e --- /dev/null +++ b/src/common/ion-owl.h @@ -0,0 +1,51 @@ +/* + * include/linux/ion-owl.h + * + * Copyright 2012 Actions Semi Inc. + * Author: Actions Semi, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#if !defined(__KERNEL__) +#define __user +#endif + +#ifndef _UAPI_LINUX_ION_OWL_H +#define _UAPI_LINUX_ION_OWL_H + +#include + +/* for cmd OWL_ION_GET_PHY */ +struct owl_ion_phys_data { + ion_user_handle_t handle; + unsigned long phys_addr; + size_t size; +}; + +/* Custom Ioctl's. */ +enum { + OWL_ION_GET_PHY = 0, +}; + +/** + * These are the only ids that should be used for Ion heap ids. + * The ids listed are the order in which allocation will be attempted + * if specified. Don't swap the order of heap ids unless you know what + * you are doing! 
+ * Id's are spaced by purpose to allow new Id's to be inserted in-between (for + * possible fallbacks) + */ + +enum ion_heap_ids { + ION_HEAP_ID_INVALID = -1, + ION_HEAP_ID_PMEM = 0, + ION_HEAP_ID_FB = 8, + ION_HEAP_ID_SYSTEM = 12, + ION_HEAP_ID_RESERVED = 31 /** Bit reserved for ION_SECURE flag */ +}; + +#endif /* _UAPI_LINUX_ION_OWL_H */ diff --git a/src/common/ion.h b/src/common/ion.h new file mode 100755 index 0000000..eafcc03 --- /dev/null +++ b/src/common/ion.h @@ -0,0 +1,240 @@ +/* + * drivers/staging/android/uapi/ion.h + * + * Copyright (C) 2011 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef _UAPI_LINUX_ION_H +#define _UAPI_LINUX_ION_H + +#include +#include + +typedef int ion_user_handle_t; + +/** + * enum ion_heap_types - list of all possible types of heaps + * @ION_HEAP_TYPE_SYSTEM: memory allocated via vmalloc + * @ION_HEAP_TYPE_SYSTEM_CONTIG: memory allocated via kmalloc + * @ION_HEAP_TYPE_CARVEOUT: memory allocated from a prereserved + * carveout heap, allocations are physically + * contiguous + * @ION_HEAP_TYPE_DMA: memory allocated via DMA API + * @ION_NUM_HEAPS: helper for iterating over heaps, a bit mask + * is used to identify the heaps, so only 32 + * total heap types are supported + */ +enum ion_heap_type { + ION_HEAP_TYPE_SYSTEM, + ION_HEAP_TYPE_SYSTEM_CONTIG, + ION_HEAP_TYPE_CARVEOUT, + ION_HEAP_TYPE_CHUNK, + ION_HEAP_TYPE_DMA, + ION_HEAP_TYPE_CUSTOM, /* must be last so device specific heaps always + are at the end of this enum */ + ION_NUM_HEAPS = 16, +}; + +#define ION_HEAP_SYSTEM_MASK (1 << ION_HEAP_TYPE_SYSTEM) +#define ION_HEAP_SYSTEM_CONTIG_MASK (1 << ION_HEAP_TYPE_SYSTEM_CONTIG) +#define ION_HEAP_CARVEOUT_MASK (1 << ION_HEAP_TYPE_CARVEOUT) +#define ION_HEAP_TYPE_DMA_MASK (1 << ION_HEAP_TYPE_DMA) + +#define ION_NUM_HEAP_IDS sizeof(unsigned int) * 8 + +/** + * allocation flags - the lower 16 bits are used by core ion, the upper 16 + * bits are reserved for use by the heaps themselves. 
+ */ +#define ION_FLAG_CACHED 1 /* mappings of this buffer should be + cached, ion will do cache + maintenance when the buffer is + mapped for dma */ +#define ION_FLAG_CACHED_NEEDS_SYNC 2 /* mappings of this buffer will created + at mmap time, if this is set + caches must be managed manually */ + +/** + * DOC: Ion Userspace API + * + * create a client by opening /dev/ion + * most operations handled via following ioctls + * + */ + +/** + * struct ion_allocation_data - metadata passed from userspace for allocations + * @len: size of the allocation + * @align: required alignment of the allocation + * @heap_id_mask: mask of heap ids to allocate from + * @flags: flags passed to heap + * @handle: pointer that will be populated with a cookie to use to + * refer to this allocation + * + * Provided by userspace as an argument to the ioctl + */ +struct ion_allocation_data { + size_t len; + size_t align; + unsigned int heap_id_mask; + unsigned int flags; + ion_user_handle_t handle; +}; + +/** + * struct ion_fd_data - metadata passed to/from userspace for a handle/fd pair + * @handle: a handle + * @fd: a file descriptor representing that handle + * + * For ION_IOC_SHARE or ION_IOC_MAP userspace populates the handle field with + * the handle returned from ion alloc, and the kernel returns the file + * descriptor to share or map in the fd field. For ION_IOC_IMPORT, userspace + * provides the file descriptor and the kernel returns the handle. + */ +struct ion_fd_data { + ion_user_handle_t handle; + int fd; +}; + +/** + * struct ion_handle_data - a handle passed to/from the kernel + * @handle: a handle + */ +struct ion_handle_data { + ion_user_handle_t handle; +}; + +/** + * struct ion_custom_data - metadata passed to/from userspace for a custom ioctl + * @cmd: the custom ioctl function to call + * @arg: additional data to pass to the custom ioctl, typically a user + * pointer to a predefined structure + * + * This works just like the regular cmd and arg fields of an ioctl. + */ +struct ion_custom_data { + unsigned int cmd; + unsigned long arg; +}; + +/** struct ion_flush_data - data passed to ion for flushing caches + * + * @handle: handle with data to flush + * @fd: fd to flush + * @vaddr: userspace virtual address mapped with mmap + * @offset: offset into the handle to flush + * @length: length of handle to flush + * + * Performs cache operations on the handle. If p is the start address + * of the handle, p + offset through p + offset + length will have + * the cache operations performed + */ +struct ion_flush_data { + void *handle; /* no used, kept for compatibility */ + int fd; + void *vaddr; + unsigned int offset; + unsigned int length; +}; + +#define ION_IOC_MAGIC 'I' + +/** + * DOC: ION_IOC_ALLOC - allocate memory + * + * Takes an ion_allocation_data struct and returns it with the handle field + * populated with the opaque handle for the allocation. + */ +#define ION_IOC_ALLOC _IOWR(ION_IOC_MAGIC, 0, \ + struct ion_allocation_data) + +/** + * DOC: ION_IOC_FREE - free memory + * + * Takes an ion_handle_data struct and frees the handle. + */ +#define ION_IOC_FREE _IOWR(ION_IOC_MAGIC, 1, struct ion_handle_data) + +/** + * DOC: ION_IOC_MAP - get a file descriptor to mmap + * + * Takes an ion_fd_data struct with the handle field populated with a valid + * opaque handle. Returns the struct with the fd field set to a file + * descriptor open in the current address space. This file descriptor + * can then be used as an argument to mmap. 
+ */ +#define ION_IOC_MAP _IOWR(ION_IOC_MAGIC, 2, struct ion_fd_data) + +/** + * DOC: ION_IOC_SHARE - creates a file descriptor to use to share an allocation + * + * Takes an ion_fd_data struct with the handle field populated with a valid + * opaque handle. Returns the struct with the fd field set to a file + * descriptor open in the current address space. This file descriptor + * can then be passed to another process. The corresponding opaque handle can + * be retrieved via ION_IOC_IMPORT. + */ +#define ION_IOC_SHARE _IOWR(ION_IOC_MAGIC, 4, struct ion_fd_data) + +/** + * DOC: ION_IOC_IMPORT - imports a shared file descriptor + * + * Takes an ion_fd_data struct with the fd field populated with a valid file + * descriptor obtained from ION_IOC_SHARE and returns the struct with the handle + * filed set to the corresponding opaque handle. + */ +#define ION_IOC_IMPORT _IOWR(ION_IOC_MAGIC, 5, struct ion_fd_data) + +/** + * DOC: ION_IOC_SYNC - syncs a shared file descriptors to memory + * + * Deprecated in favor of using the dma_buf api's correctly (syncing + * will happend automatically when the buffer is mapped to a device). + * If necessary should be used after touching a cached buffer from the cpu, + * this will make the buffer in memory coherent. + */ +#define ION_IOC_SYNC _IOWR(ION_IOC_MAGIC, 7, struct ion_fd_data) + +/** + * DOC: ION_IOC_CUSTOM - call architecture specific ion ioctl + * + * Takes the argument of the architecture specific ioctl to call and + * passes appropriate userdata for that ioctl + */ +#define ION_IOC_CUSTOM _IOWR(ION_IOC_MAGIC, 6, struct ion_custom_data) + + +/** + * DOC: ION_IOC_CLEAN_CACHES - clean the caches + * + * Clean the caches of the handle specified. + */ +#define ION_IOC_CLEAN_CACHES _IOWR(ION_IOC_MAGIC, 20, \ + struct ion_flush_data) +/** + * DOC: ION_MSM_IOC_INV_CACHES - invalidate the caches + * + * Invalidate the caches of the handle specified. + */ +#define ION_IOC_INV_CACHES _IOWR(ION_IOC_MAGIC, 21, \ + struct ion_flush_data) +/** + * DOC: ION_MSM_IOC_CLEAN_CACHES - clean and invalidate the caches + * + * Clean and invalidate the caches of the handle specified. 
+ */ +#define ION_IOC_CLEAN_INV_CACHES _IOWR(ION_IOC_MAGIC, 22, \ + struct ion_flush_data) + + +#endif /* _UAPI_LINUX_ION_H */ diff --git a/src/common/scaler_neon.c b/src/common/scaler_neon.c old mode 100755 new mode 100644 index 19c3803..7785061 --- a/src/common/scaler_neon.c +++ b/src/common/scaler_neon.c @@ -1,10 +1,9 @@ #include #include #include -#include "scaler_neon.h" // -// arm NEON / C integer scalers for miyoomini +// arm NEON / C integer scalers for ARMv7 devices // args/ src : src offset address of top left corner // dst : dst offset address of top left corner // sw : src width pixels @@ -18,12 +17,362 @@ // if odd#, then handled by the C scaler // +// +// C scalers +// +void scale1x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; + uint32_t swl = sw*sizeof(uint16_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*1; } + if ((ymul == 1)&&(swl == sp)&&(sp == dp)) memcpy(dst, src, sp*sh); + else { + for (; sh>0; sh--, src=(uint8_t*)src+sp) { + for (uint32_t i=ymul; i>0; i--, dst=(uint8_t*)dst+dp) memcpy(dst, src, swl); + } + } +} + +void scale1x1_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale1x_c16(src, dst, sw, sh, sp, dp, 1); } +void scale1x2_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale1x_c16(src, dst, sw, sh, sp, dp, 2); } +void scale1x3_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale1x_c16(src, dst, sw, sh, sp, dp, 3); } +void scale1x4_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale1x_c16(src, dst, sw, sh, sp, dp, 4); } + +void scale1x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; + uint32_t swl = sw*sizeof(uint32_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*1; } + if ((ymul == 1)&&(swl == sp)&&(sp == dp)) memcpy(dst, src, sp*sh); + else { + for (; sh>0; sh--, src=(uint8_t*)src+sp) { + for (uint32_t i=ymul; i>0; i--, dst=(uint8_t*)dst+dp) memcpy(dst, src, swl); + } + } +} + +void scale1x1_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale1x_c32(src, dst, sw, sh, sp, dp, 1); } +void scale1x2_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale1x_c32(src, dst, sw, sh, sp, dp, 2); } +void scale1x3_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale1x_c32(src, dst, sw, sh, sp, dp, 3); } +void scale1x4_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale1x_c32(src, dst, sw, sh, sp, dp, 4); } + +void scale2x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; + uint32_t x, dx, pix, dpix1, dpix2, swl = sw*sizeof(uint16_t); + if (!sp) { sp = swl; } swl*=2; if (!dp) { dp = swl; } + for (; sh>0; sh--, src=(uint8_t*)src+sp) { + uint32_t *s = (uint32_t* __restrict)src; + uint32_t *d = (uint32_t* __restrict)dst; + for (x=dx=0; x<(sw/2); x++, dx+=2) { + pix = s[x]; + dpix1=(pix & 0x0000FFFF)|(pix<<16); + dpix2=(pix & 0xFFFF0000)|(pix>>16); + d[dx] = dpix1; d[dx+1] = dpix2; + } + if (sw&1) { + uint16_t *s16 = 
(uint16_t*)s; + uint16_t pix16 = s16[x*2]; + d[dx] = pix16|(pix16<<16); + } + void* __restrict dstsrc = dst; dst = (uint8_t*)dst+dp; + for (uint32_t i=ymul-1; i>0; i--, dst=(uint8_t*)dst+dp) memcpy(dst, dstsrc, swl); + } +} + +void scale2x1_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale2x_c16(src, dst, sw, sh, sp, dp, 1); } +void scale2x2_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale2x_c16(src, dst, sw, sh, sp, dp, 2); } +void scale2x3_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale2x_c16(src, dst, sw, sh, sp, dp, 3); } +void scale2x4_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale2x_c16(src, dst, sw, sh, sp, dp, 4); } + +void scale2x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; + uint32_t x, dx, pix, swl = sw*sizeof(uint32_t); + if (!sp) { sp = swl; } swl*=2; if (!dp) { dp = swl; } + for (; sh>0; sh--, src=(uint8_t*)src+sp) { + uint32_t *s = (uint32_t* __restrict)src; + uint32_t *d = (uint32_t* __restrict)dst; + for (x=dx=0; x0; i--, dst=(uint8_t*)dst+dp) memcpy(dst, dstsrc, swl); + } +} + +void scale2x1_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale2x_c32(src, dst, sw, sh, sp, dp, 1); } +void scale2x2_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale2x_c32(src, dst, sw, sh, sp, dp, 2); } +void scale2x3_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale2x_c32(src, dst, sw, sh, sp, dp, 3); } +void scale2x4_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale2x_c32(src, dst, sw, sh, sp, dp, 4); } + +void scale3x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; + uint32_t x, dx, pix, dpix1, dpix2, swl = sw*sizeof(uint16_t); + if (!sp) { sp = swl; } swl*=3; if (!dp) { dp = swl; } + for (; sh>0; sh--, src=(uint8_t*)src+sp) { + uint32_t *s = (uint32_t* __restrict)src; + uint32_t *d = (uint32_t* __restrict)dst; + for (x=dx=0; x<(sw/2); x++, dx+=3) { + pix = s[x]; + dpix1=(pix & 0x0000FFFF)|(pix<<16); + dpix2=(pix & 0xFFFF0000)|(pix>>16); + d[dx] = dpix1; d[dx+1] = pix; d[dx+2] = dpix2; + } + if (sw&1) { + uint16_t *s16 = (uint16_t*)s; + uint16_t *d16 = (uint16_t*)d; + uint16_t pix16 = s16[x*2]; + dpix1 = pix16|(pix16<<16); + d[dx] = dpix1; d16[(dx+1)*2] = pix16; + } + void* __restrict dstsrc = dst; dst = (uint8_t*)dst+dp; + for (uint32_t i=ymul-1; i>0; i--, dst=(uint8_t*)dst+dp) memcpy(dst, dstsrc, swl); + } +} + +void scale3x1_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale3x_c16(src, dst, sw, sh, sp, dp, 1); } +void scale3x2_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale3x_c16(src, dst, sw, sh, sp, dp, 2); } +void scale3x3_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale3x_c16(src, dst, sw, sh, sp, dp, 3); } +void scale3x4_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + 
scale3x_c16(src, dst, sw, sh, sp, dp, 4); } + +void scale3x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; + uint32_t x, dx, pix, swl = sw*sizeof(uint32_t); + if (!sp) { sp = swl; } swl*=3; if (!dp) { dp = swl; } + for (; sh>0; sh--, src=(uint8_t*)src+sp) { + uint32_t *s = (uint32_t* __restrict)src; + uint32_t *d = (uint32_t* __restrict)dst; + for (x=dx=0; x0; i--, dst=(uint8_t*)dst+dp) memcpy(dst, dstsrc, swl); + } +} + +void scale3x1_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale3x_c32(src, dst, sw, sh, sp, dp, 1); } +void scale3x2_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale3x_c32(src, dst, sw, sh, sp, dp, 2); } +void scale3x3_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale3x_c32(src, dst, sw, sh, sp, dp, 3); } +void scale3x4_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale3x_c32(src, dst, sw, sh, sp, dp, 4); } + +void scale4x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; + uint32_t x, dx, pix, dpix1, dpix2, swl = sw*sizeof(uint16_t); + if (!sp) { sp = swl; } swl*=4; if (!dp) { dp = swl; } + for (; sh>0; sh--, src=(uint8_t*)src+sp) { + uint32_t *s = (uint32_t* __restrict)src; + uint32_t *d = (uint32_t* __restrict)dst; + for (x=dx=0; x<(sw/2); x++, dx+=4) { + pix = s[x]; + dpix1=(pix & 0x0000FFFF)|(pix<<16); + dpix2=(pix & 0xFFFF0000)|(pix>>16); + d[dx] = dpix1; d[dx+1] = dpix1; d[dx+2] = dpix2; d[dx+3] = dpix2; + } + if (sw&1) { + uint16_t *s16 = (uint16_t*)s; + uint16_t pix16 = s16[x*2]; + dpix1 = pix16|(pix16<<16); + d[dx] = dpix1; d[dx+1] = dpix1; + } + void* __restrict dstsrc = dst; dst = (uint8_t*)dst+dp; + for (uint32_t i=ymul-1; i>0; i--, dst=(uint8_t*)dst+dp) memcpy(dst, dstsrc, swl); + } +} + +void scale4x1_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale4x_c16(src, dst, sw, sh, sp, dp, 1); } +void scale4x2_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale4x_c16(src, dst, sw, sh, sp, dp, 2); } +void scale4x3_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale4x_c16(src, dst, sw, sh, sp, dp, 3); } +void scale4x4_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale4x_c16(src, dst, sw, sh, sp, dp, 4); } + +void scale4x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; + uint32_t x, dx, pix, swl = sw*sizeof(uint32_t); + if (!sp) { sp = swl; } swl*=4; if (!dp) { dp = swl; } + for (; sh>0; sh--, src=(uint8_t*)src+sp) { + uint32_t *s = (uint32_t* __restrict)src; + uint32_t *d = (uint32_t* __restrict)dst; + for (x=dx=0; x0; i--, dst=(uint8_t*)dst+dp) memcpy(dst, dstsrc, swl); + } +} + +void scale4x1_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale4x_c32(src, dst, sw, sh, sp, dp, 1); } +void scale4x2_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale4x_c32(src, dst, sw, sh, sp, dp, 2); } +void 
scale4x3_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale4x_c32(src, dst, sw, sh, sp, dp, 3); } +void scale4x4_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale4x_c32(src, dst, sw, sh, sp, dp, 4); } + +void scale5x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; + uint32_t x, dx, pix, dpix1, dpix2, swl = sw*sizeof(uint16_t); + if (!sp) { sp = swl; } swl*=5; if (!dp) { dp = swl; } + for (; sh>0; sh--, src=(uint8_t*)src+sp) { + uint32_t *s = (uint32_t* __restrict)src; + uint32_t *d = (uint32_t* __restrict)dst; + for (x=dx=0; x<(sw/2); x++, dx+=5) { + pix = s[x]; + dpix1=(pix & 0x0000FFFF)|(pix<<16); + dpix2=(pix & 0xFFFF0000)|(pix>>16); + d[dx] = dpix1; d[dx+1] = dpix1; d[dx+2] = pix; d[dx+3] = dpix2; d[dx+4] = dpix2; + } + if (sw&1) { + uint16_t *s16 = (uint16_t*)s; + uint16_t *d16 = (uint16_t*)d; + uint16_t pix16 = s16[x*2]; + dpix1 = pix16|(pix16<<16); + d[dx] = dpix1; d[dx+1] = dpix1; d16[(dx+2)*2] = pix16; + } + void* __restrict dstsrc = dst; dst = (uint8_t*)dst+dp; + for (uint32_t i=ymul-1; i>0; i--, dst=(uint8_t*)dst+dp) memcpy(dst, dstsrc, swl); + } +} + +void scale5x1_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_c16(src, dst, sw, sh, sp, dp, 1); } +void scale5x2_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_c16(src, dst, sw, sh, sp, dp, 2); } +void scale5x3_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_c16(src, dst, sw, sh, sp, dp, 3); } +void scale5x4_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_c16(src, dst, sw, sh, sp, dp, 4); } +void scale5x5_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_c16(src, dst, sw, sh, sp, dp, 5); } + +void scale5x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; + uint32_t x, dx, pix, swl = sw*sizeof(uint32_t); + if (!sp) { sp = swl; } swl*=5; if (!dp) { dp = swl; } + for (; sh>0; sh--, src=(uint8_t*)src+sp) { + uint32_t *s = (uint32_t* __restrict)src; + uint32_t *d = (uint32_t* __restrict)dst; + for (x=dx=0; x0; i--, dst=(uint8_t*)dst+dp) memcpy(dst, dstsrc, swl); + } +} + +void scale5x1_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_c32(src, dst, sw, sh, sp, dp, 1); } +void scale5x2_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_c32(src, dst, sw, sh, sp, dp, 2); } +void scale5x3_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_c32(src, dst, sw, sh, sp, dp, 3); } +void scale5x4_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_c32(src, dst, sw, sh, sp, dp, 4); } +void scale5x5_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_c32(src, dst, sw, sh, sp, dp, 5); } + +void scale6x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if 
(!sw||!sh||!ymul) return; + uint32_t x, dx, pix, dpix1, dpix2, swl = sw*sizeof(uint16_t); + if (!sp) { sp = swl; } swl*=6; if (!dp) { dp = swl; } + for (; sh>0; sh--, src=(uint8_t*)src+sp) { + uint32_t *s = (uint32_t* __restrict)src; + uint32_t *d = (uint32_t* __restrict)dst; + for (x=dx=0; x<(sw/2); x++, dx+=6) { + pix = s[x]; + dpix1=(pix & 0x0000FFFF)|(pix<<16); + dpix2=(pix & 0xFFFF0000)|(pix>>16); + d[dx] = dpix1; d[dx+1] = dpix1; d[dx+2] = dpix1; d[dx+3] = dpix2; d[dx+4] = dpix2; d[dx+5] = dpix2; + } + if (sw&1) { + uint16_t *s16 = (uint16_t*)s; + uint16_t pix16 = s16[x*2]; + dpix1 = pix16|(pix16<<16); + d[dx] = dpix1; d[dx+1] = dpix1; d[dx+2] = dpix1; + } + void* __restrict dstsrc = dst; dst = (uint8_t*)dst+dp; + for (uint32_t i=ymul-1; i>0; i--, dst=(uint8_t*)dst+dp) memcpy(dst, dstsrc, swl); + } +} + +void scale6x1_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_c16(src, dst, sw, sh, sp, dp, 1); } +void scale6x2_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_c16(src, dst, sw, sh, sp, dp, 2); } +void scale6x3_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_c16(src, dst, sw, sh, sp, dp, 3); } +void scale6x4_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_c16(src, dst, sw, sh, sp, dp, 4); } +void scale6x5_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_c16(src, dst, sw, sh, sp, dp, 5); } +void scale6x6_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_c16(src, dst, sw, sh, sp, dp, 6); } + +void scale6x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; + uint32_t x, dx, pix, swl = sw*sizeof(uint32_t); + if (!sp) { sp = swl; } swl*=6; if (!dp) { dp = swl; } + for (; sh>0; sh--, src=(uint8_t*)src+sp) { + uint32_t *s = (uint32_t* __restrict)src; + uint32_t *d = (uint32_t* __restrict)dst; + for (x=dx=0; x0; i--, dst=(uint8_t*)dst+dp) memcpy(dst, dstsrc, swl); + } +} + +void scale6x1_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_c32(src, dst, sw, sh, sp, dp, 1); } +void scale6x2_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_c32(src, dst, sw, sh, sp, dp, 2); } +void scale6x3_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_c32(src, dst, sw, sh, sp, dp, 3); } +void scale6x4_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_c32(src, dst, sw, sh, sp, dp, 4); } +void scale6x5_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_c32(src, dst, sw, sh, sp, dp, 5); } +void scale6x6_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_c32(src, dst, sw, sh, sp, dp, 6); } + +// // memcpy_neon (dst/src must be aligned 4, size must be aligned 2) -static inline void memcpy_neon(void* dst, void* src, uint32_t size) { +// +void memcpy_neon(void* dst, void* src, uint32_t size) { asm volatile ( - " bic r4, %2, #127 ;" - " add r3, %0, %2 ;" // r3 = endofs - " add r4, 
%0, r4 ;" // r4 = s128ofs + " bic r4, %[sz], #127 ;" + " add r3, %[s], %[sz] ;" // r3 = endofs + " add r4, %[s], r4 ;" // r4 = s128ofs " cmp %[s], r4 ;" " beq 2f ;" "1: vldmia %[s]!, {q8-q15} ;" // 128 bytes @@ -71,38 +420,469 @@ static inline void memcpy_neon(void* dst, void* src, uint32_t size) { // NEON scalers // -void scale1x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } +void scale1x1_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; uint32_t swl = sw*sizeof(uint16_t); if (!sp) { sp = swl; } if (!dp) { dp = swl*1; } - if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale1x_c16(src,dst,sw,sh,sp,dp); return; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale1x1_c16(src,dst,sw,sh,sp,dp); return; } if ((swl == sp)&&(sp == dp)) memcpy_neon(dst, src, sp*sh); else for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp) memcpy_neon(dst, src, swl); } -void scale1x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } +void scale1x2_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw*sizeof(uint16_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale1x2_c16(src,dst,sw,sh,sp,dp); return; } + uint32_t swl128 = swl & ~127; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*2 - swl; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x128bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q15} ;" // 128 bytes + " vstmia %1!, {q8-q15} ;" + " vstmia r9!, {q8-q15} ;" + " cmp %0, lr ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 8f ;" + " tst %3, #64 ;" + " beq 4f ;" + " vldmia %0!, {q8-q11} ;" // 64 bytes + " vstmia %1!, {q8-q11} ;" + " vstmia r9!, {q8-q11} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "4: tst %3, #32 ;" + " beq 5f ;" + " vldmia %0!, {q12-q13} ;" // 32 bytes + " vstmia %1!, {q12-q13} ;" + " vstmia r9!, {q12-q13} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "5: tst %3, #16 ;" + " beq 6f ;" + " vldmia %0!, {q14} ;" // 16 bytes + " vstmia %1!, {q14} ;" + " vstmia r9!, {q14} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "6: tst %3, #8 ;" + " beq 7f ;" + " vldmia %0!, {d30} ;" // 8 bytes + " vstmia %1!, {d30} ;" + " vstmia r9!, {d30} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "7: ldr lr, [%0],#4 ;" // 4 bytes + " str lr, [%1],#4 ;" + " str lr, [r9] ;" + "8: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl128), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale1x3_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw*sizeof(uint16_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale1x3_c16(src,dst,sw,sh,sp,dp); return; } + uint32_t swl128 = swl & ~127; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*3 - swl; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x128bytes offset + " 
add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " add r10, r9, %7 ;" // r10 = 3x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q15} ;" // 128 bytes + " vstmia %1!, {q8-q15} ;" + " vstmia r9!, {q8-q15} ;" + " vstmia r10!, {q8-q15} ;" + " cmp %0, lr ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 8f ;" + " tst %3, #64 ;" + " beq 4f ;" + " vldmia %0!, {q8-q11} ;" // 64 bytes + " vstmia %1!, {q8-q11} ;" + " vstmia r9!, {q8-q11} ;" + " vstmia r10!, {q8-q11} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "4: tst %3, #32 ;" + " beq 5f ;" + " vldmia %0!, {q12-q13} ;" // 32 bytes + " vstmia %1!, {q12-q13} ;" + " vstmia r9!, {q12-q13} ;" + " vstmia r10!, {q12-q13} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "5: tst %3, #16 ;" + " beq 6f ;" + " vldmia %0!, {q14} ;" // 16 bytes + " vstmia %1!, {q14} ;" + " vstmia r9!, {q14} ;" + " vstmia r10!, {q14} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "6: tst %3, #8 ;" + " beq 7f ;" + " vldmia %0!, {d30} ;" // 8 bytes + " vstmia %1!, {d30} ;" + " vstmia r9!, {d30} ;" + " vstmia r10!, {d30} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "7: ldr lr, [%0],#4 ;" // 4 bytes + " str lr, [%1],#4 ;" + " str lr, [r9] ;" + " str lr, [r10] ;" + "8: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl128), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","r10","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale1x4_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw*sizeof(uint16_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale1x4_c16(src,dst,sw,sh,sp,dp); return; } + uint32_t swl128 = swl & ~127; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*4 - swl; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x128bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " add r10, r9, %7 ;" // r10 = 3x line offset + " add r11, r10, %7 ;" // r11 = 4x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q15} ;" // 128 bytes + " vstmia %1!, {q8-q15} ;" + " vstmia r9!, {q8-q15} ;" + " vstmia r10!, {q8-q15} ;" + " vstmia r11!, {q8-q15} ;" + " cmp %0, lr ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 8f ;" + " tst %3, #64 ;" + " beq 4f ;" + " vldmia %0!, {q8-q11} ;" // 64 bytes + " vstmia %1!, {q8-q11} ;" + " vstmia r9!, {q8-q11} ;" + " vstmia r10!, {q8-q11} ;" + " vstmia r11!, {q8-q11} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "4: tst %3, #32 ;" + " beq 5f ;" + " vldmia %0!, {q12-q13} ;" // 32 bytes + " vstmia %1!, {q12-q13} ;" + " vstmia r9!, {q12-q13} ;" + " vstmia r10!, {q12-q13} ;" + " vstmia r11!, {q12-q13} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "5: tst %3, #16 ;" + " beq 6f ;" + " vldmia %0!, {q14} ;" // 16 bytes + " vstmia %1!, {q14} ;" + " vstmia r9!, {q14} ;" + " vstmia r10!, {q14} ;" + " vstmia r11!, {q14} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "6: tst %3, #8 ;" + " beq 7f ;" + " vldmia %0!, {d30} ;" // 8 bytes + " vstmia %1!, {d30} ;" + " vstmia r9!, {d30} ;" + " vstmia r10!, {d30} ;" + " vstmia r11!, {d30} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "7: ldr lr, [%0],#4 ;" // 4 bytes + " str lr, [%1],#4 ;" + " str lr, [r9] ;" + " str lr, [r10] ;" + " str lr, [r11] ;" + "8: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl128), "r"(swl), 
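
Each scale1xN kernel precomputes the same three loop constants: sadd (source pitch minus the bytes copied), dadd (ymul destination pitches minus the bytes written to the first output row), and finofs (one past the last source row). A C sketch of the row walk, with hypothetical names, to make the bookkeeping explicit:

#include <stdint.h>
#include <string.h>

static void scale1xN_model(const uint8_t* src, uint8_t* dst, uint32_t swl,
                           uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) {
	uint32_t sadd = sp - swl;               // skip source row padding
	uint32_t dadd = dp*ymul - swl;          // land on the next output row group
	const uint8_t* finofs = src + sp*sh;    // end-of-source sentinel, "cmp %0, %6"
	while (src != finofs) {
		for (uint32_t y = 0; y < ymul; y++) // rows 2..ymul go through r9/r10/r11
			memcpy(dst + y*dp, src, swl);
		src += swl; dst += swl;             // the asm pointers advance while copying
		src += sadd; dst += dadd;           // "8:" re-aligns them to sp and dp*ymul
	}
}
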
"r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","r10","r11","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale1x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + const void (*func[4])(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t) + = { &scale1x1_n16, &scale1x2_n16, &scale1x3_n16, &scale1x4_n16 }; + if (--ymul < 4) func[ymul](src, dst, sw, sh, sp, dp); + return; +} +void scale1x1_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; uint32_t swl = sw*sizeof(uint32_t); if (!sp) { sp = swl; } if (!dp) { dp = swl*1; } - if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale1x_c32(src,dst,sw,sh,sp,dp); return; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale1x1_c32(src,dst,sw,sh,sp,dp); return; } if ((swl == sp)&&(sp == dp)) memcpy_neon(dst, src, sp*sh); else for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp) memcpy_neon(dst, src, swl); } -void scale2x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } +void scale1x2_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw*sizeof(uint32_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale1x2_c32(src,dst,sw,sh,sp,dp); return; } + uint32_t swl128 = swl & ~127; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*2 - swl; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x128bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q15} ;" // 128 bytes + " vstmia %1!, {q8-q15} ;" + " cmp %0, lr ;" + " vstmia r9!, {q8-q15} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 8f ;" + " tst %3, #64 ;" + " beq 4f ;" + " vldmia %0!, {q8-q11} ;" // 64 bytes + " vstmia %1!, {q8-q11} ;" + " cmp %0, r8 ;" + " vstmia r9!, {q8-q11} ;" + " beq 8f ;" + "4: tst %3, #32 ;" + " beq 5f ;" + " vldmia %0!, {q12-q13} ;" // 32 bytes + " vstmia %1!, {q12-q13} ;" + " cmp %0, r8 ;" + " vstmia r9!, {q12-q13} ;" + " beq 8f ;" + "5: tst %3, #16 ;" + " beq 6f ;" + " vldmia %0!, {q14} ;" // 16 bytes + " vstmia %1!, {q14} ;" + " cmp %0, r8 ;" + " vstmia r9!, {q14} ;" + " beq 8f ;" + "6: tst %3, #8 ;" + " beq 7f ;" + " vldmia %0!, {d30} ;" // 8 bytes + " vstmia %1!, {d30} ;" + " cmp %0, r8 ;" + " vstmia r9!, {d30} ;" + " beq 8f ;" + "7: ldr lr, [%0],#4 ;" // 4 bytes + " str lr, [%1],#4 ;" + " str lr, [r9] ;" + "8: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl128), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale1x3_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw*sizeof(uint32_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale1x3_c32(src,dst,sw,sh,sp,dp); return; } + uint32_t swl128 = swl & ~127; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*3 - swl; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: 
add lr, %0, %2 ;" // lr = x128bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " add r10, r9, %7 ;" // r10 = 3x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q15} ;" // 128 bytes + " vstmia %1!, {q8-q15} ;" + " vstmia r9!, {q8-q15} ;" + " vstmia r10!, {q8-q15} ;" + " cmp %0, lr ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 8f ;" + " tst %3, #64 ;" + " beq 4f ;" + " vldmia %0!, {q8-q11} ;" // 64 bytes + " vstmia %1!, {q8-q11} ;" + " vstmia r9!, {q8-q11} ;" + " vstmia r10!, {q8-q11} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "4: tst %3, #32 ;" + " beq 5f ;" + " vldmia %0!, {q12-q13} ;" // 32 bytes + " vstmia %1!, {q12-q13} ;" + " vstmia r9!, {q12-q13} ;" + " vstmia r10!, {q12-q13} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "5: tst %3, #16 ;" + " beq 6f ;" + " vldmia %0!, {q14} ;" // 16 bytes + " vstmia %1!, {q14} ;" + " vstmia r9!, {q14} ;" + " vstmia r10!, {q14} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "6: tst %3, #8 ;" + " beq 7f ;" + " vldmia %0!, {d30} ;" // 8 bytes + " vstmia %1!, {d30} ;" + " vstmia r9!, {d30} ;" + " vstmia r10!, {d30} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "7: ldr lr, [%0],#4 ;" // 4 bytes + " str lr, [%1],#4 ;" + " str lr, [r9] ;" + " str lr, [r10] ;" + "8: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl128), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","r10","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale1x4_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw*sizeof(uint32_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale1x4_c32(src,dst,sw,sh,sp,dp); return; } + uint32_t swl128 = swl & ~127; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*4 - swl; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x128bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " add r10, r9, %7 ;" // r10 = 3x line offset + " add r11, r10, %7 ;" // r11 = 4x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q15} ;" // 128 bytes + " vstmia %1!, {q8-q15} ;" + " vstmia r9!, {q8-q15} ;" + " vstmia r10!, {q8-q15} ;" + " vstmia r11!, {q8-q15} ;" + " cmp %0, lr ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 8f ;" + " tst %3, #64 ;" + " beq 4f ;" + " vldmia %0!, {q8-q11} ;" // 64 bytes + " vstmia %1!, {q8-q11} ;" + " vstmia r9!, {q8-q11} ;" + " vstmia r10!, {q8-q11} ;" + " vstmia r11!, {q8-q11} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "4: tst %3, #32 ;" + " beq 5f ;" + " vldmia %0!, {q12-q13} ;" // 32 bytes + " vstmia %1!, {q12-q13} ;" + " vstmia r9!, {q12-q13} ;" + " vstmia r10!, {q12-q13} ;" + " vstmia r11!, {q12-q13} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "5: tst %3, #16 ;" + " beq 6f ;" + " vldmia %0!, {q14} ;" // 16 bytes + " vstmia %1!, {q14} ;" + " vstmia r9!, {q14} ;" + " vstmia r10!, {q14} ;" + " vstmia r11!, {q14} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "6: tst %3, #8 ;" + " beq 7f ;" + " vldmia %0!, {d30} ;" // 8 bytes + " vstmia %1!, {d30} ;" + " vstmia r9!, {d30} ;" + " vstmia r10!, {d30} ;" + " vstmia r11!, {d30} ;" + " cmp %0, r8 ;" + " beq 8f ;" + "7: ldr lr, [%0],#4 ;" // 4 bytes + " str lr, [%1],#4 ;" + " str lr, [r9] ;" + " str lr, [r10] ;" + " str lr, [r11] ;" + "8: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : 
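
scale1x_n16 above (and its _n32 twin below) introduces the dispatch pattern reused by every family in this file: a four-entry function-pointer table indexed by ymul-1, where the unsigned wraparound of --ymul makes ymul==0 a no-op. As a sketch with a typedef: note that void (*const tab[4])(...) makes the table itself const, whereas const void (*tab[4])(...) as written declares functions returning a (meaninglessly) qualified void.

#include <stdint.h>

typedef void (*scaler_t)(void* __restrict, void* __restrict,
                         uint32_t, uint32_t, uint32_t, uint32_t);

extern void scale1x1_n16(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t);
extern void scale1x2_n16(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t);
extern void scale1x3_n16(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t);
extern void scale1x4_n16(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t);

void scale1x_n16_model(void* __restrict src, void* __restrict dst,
                       uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) {
	static scaler_t const tab[4] = { scale1x1_n16, scale1x2_n16, scale1x3_n16, scale1x4_n16 };
	if (--ymul < 4) tab[ymul](src, dst, sw, sh, sp, dp); // ymul==0 wraps and is ignored
}
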
"+r"(src), "+r"(dst) + : "r"(swl128), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","r10","r11","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale1x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + const void (*func[4])(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t) + = { &scale1x1_n32, &scale1x2_n32, &scale1x3_n32, &scale1x4_n32 }; + if (--ymul < 4) func[ymul](src, dst, sw, sh, sp, dp); + return; +} + +void scale2x1_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; uint32_t swl = sw * sizeof(uint16_t); if (!sp) { sp = swl; } if (!dp) { dp = swl*2; } - if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale2x_c16(src,dst,sw,sh,sp,dp); return; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale2x1_c16(src,dst,sw,sh,sp,dp); return; } uint32_t swl64 = swl & ~63; - uint32_t swrest = swl & 63; uint32_t sadd = sp - swl; - uint32_t dadd = dp*2 - swl*2; + uint32_t dadd = dp - swl*2; uint8_t* finofs = (uint8_t*)src + (sp*sh); asm volatile ( "1: add lr, %0, %2 ;" // lr = x64bytes offset - " add r9, %0, %3 ;" // r9 = lineend offset - " add r10, %1, %7 ;" // r10 = 2x line offset + " add r8, %0, %3 ;" // r8 = lineend offset " cmp %0, lr ;" " beq 3f ;" "2: vldmia %0!, {q8-q11} ;" // 32 pixels 64 bytes @@ -156,11 +936,10 @@ void scale2x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32 " vext.16 d16, d1,d0,#2 ;" " cmp %0, lr ;" " vstmia %1!, {q8-q15} ;" - " vstmia r10!, {q8-q15} ;" " bne 2b ;" - "3: cmp %0, r9 ;" + "3: cmp %0, r8 ;" " beq 5f ;" - " tst %8, #32 ;" + " tst %3, #32 ;" " beq 4f ;" " vldmia %0!,{q8-q9} ;" // 16 pixels " vdup.16 d0, d19[3] ;" @@ -187,14 +966,250 @@ void scale2x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32 " vdup.16 d0, d16[1] ;" " vdup.16 d1, d16[0] ;" " vext.16 d16, d1,d0,#2 ;" - " cmp %0, r9 ;" + " cmp %0, r8 ;" " vstmia %1!, {q8-q11} ;" + " beq 5f ;" + "4: ldrh lr, [%0],#2 ;" // rest + " orr lr, lr, lsl #16 ;" + " cmp %0, r8 ;" + " str lr, [%1],#4 ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl64), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","lr","q0","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale2x2_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint16_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*2; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale2x2_c16(src,dst,sw,sh,sp,dp); return; } + uint32_t swl64 = swl & ~63; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*2 - swl*2; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x64bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q11} ;" // 32 pixels 64 bytes + " vdup.16 d0, d23[3] ;" + " vdup.16 d1, d23[2] ;" + " vext.16 d31, d1,d0,#2 ;" + " vdup.16 d0, d23[1] ;" + " vdup.16 d1, d23[0] ;" + " vext.16 d30, d1,d0,#2 ;" + " vdup.16 d0, d22[3] ;" + " vdup.16 d1, d22[2] ;" + " vext.16 d29, d1,d0,#2 ;" + " vdup.16 d0, d22[1] ;" + " vdup.16 d1, d22[0] ;" + " vext.16 d28, d1,d0,#2 ;" + " vdup.16 d0, d21[3] ;" + " 
vdup.16 d1, d21[2] ;" + " vext.16 d27, d1,d0,#2 ;" + " vdup.16 d0, d21[1] ;" + " vdup.16 d1, d21[0] ;" + " vext.16 d26, d1,d0,#2 ;" + " vdup.16 d0, d20[3] ;" + " vdup.16 d1, d20[2] ;" + " vext.16 d25, d1,d0,#2 ;" + " vdup.16 d0, d20[1] ;" + " vdup.16 d1, d20[0] ;" + " vext.16 d24, d1,d0,#2 ;" + " vdup.16 d0, d19[3] ;" + " vdup.16 d1, d19[2] ;" + " vext.16 d23, d1,d0,#2 ;" + " vdup.16 d0, d19[1] ;" + " vdup.16 d1, d19[0] ;" + " vext.16 d22, d1,d0,#2 ;" + " vdup.16 d0, d18[3] ;" + " vdup.16 d1, d18[2] ;" + " vext.16 d21, d1,d0,#2 ;" + " vdup.16 d0, d18[1] ;" + " vdup.16 d1, d18[0] ;" + " vext.16 d20, d1,d0,#2 ;" + " vdup.16 d0, d17[3] ;" + " vdup.16 d1, d17[2] ;" + " vext.16 d19, d1,d0,#2 ;" + " vdup.16 d0, d17[1] ;" + " vdup.16 d1, d17[0] ;" + " vext.16 d18, d1,d0,#2 ;" + " vdup.16 d0, d16[3] ;" + " vdup.16 d1, d16[2] ;" + " vext.16 d17, d1,d0,#2 ;" + " vdup.16 d0, d16[1] ;" + " vdup.16 d1, d16[0] ;" + " vext.16 d16, d1,d0,#2 ;" + " cmp %0, lr ;" + " vstmia %1!, {q8-q15} ;" + " vstmia r9!, {q8-q15} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + " tst %3, #32 ;" + " beq 4f ;" + " vldmia %0!,{q8-q9} ;" // 16 pixels + " vdup.16 d0, d19[3] ;" + " vdup.16 d1, d19[2] ;" + " vext.16 d23, d1,d0,#2 ;" + " vdup.16 d0, d19[1] ;" + " vdup.16 d1, d19[0] ;" + " vext.16 d22, d1,d0,#2 ;" + " vdup.16 d0, d18[3] ;" + " vdup.16 d1, d18[2] ;" + " vext.16 d21, d1,d0,#2 ;" + " vdup.16 d0, d18[1] ;" + " vdup.16 d1, d18[0] ;" + " vext.16 d20, d1,d0,#2 ;" + " vdup.16 d0, d17[3] ;" + " vdup.16 d1, d17[2] ;" + " vext.16 d19, d1,d0,#2 ;" + " vdup.16 d0, d17[1] ;" + " vdup.16 d1, d17[0] ;" + " vext.16 d18, d1,d0,#2 ;" + " vdup.16 d0, d16[3] ;" + " vdup.16 d1, d16[2] ;" + " vext.16 d17, d1,d0,#2 ;" + " vdup.16 d0, d16[1] ;" + " vdup.16 d1, d16[0] ;" + " vext.16 d16, d1,d0,#2 ;" + " cmp %0, r8 ;" + " vstmia %1!, {q8-q11} ;" + " vstmia r9!, {q8-q11} ;" + " beq 5f ;" + "4: ldrh lr, [%0],#2 ;" // rest + " orr lr, lr, lsl #16 ;" + " cmp %0, r8 ;" + " str lr, [%1],#4 ;" + " str lr, [r9],#4 ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl64), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","lr","q0","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale2x3_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint16_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*2; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale2x3_c16(src,dst,sw,sh,sp,dp); return; } + uint32_t swl64 = swl & ~63; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*3 - swl*2; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x64bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " add r10, r9, %7 ;" // r10 = 3x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q11} ;" // 32 pixels 64 bytes + " vdup.16 d0, d23[3] ;" + " vdup.16 d1, d23[2] ;" + " vext.16 d31, d1,d0,#2 ;" + " vdup.16 d0, d23[1] ;" + " vdup.16 d1, d23[0] ;" + " vext.16 d30, d1,d0,#2 ;" + " vdup.16 d0, d22[3] ;" + " vdup.16 d1, d22[2] ;" + " vext.16 d29, d1,d0,#2 ;" + " vdup.16 d0, d22[1] ;" + " vdup.16 d1, d22[0] ;" + " vext.16 d28, d1,d0,#2 ;" + " vdup.16 d0, d21[3] ;" + " vdup.16 d1, d21[2] ;" + " vext.16 d27, d1,d0,#2 ;" + " vdup.16 d0, d21[1] ;" + " vdup.16 d1, d21[0] ;" + " vext.16 d26, d1,d0,#2 ;" + " vdup.16 d0, d20[3] ;" + " 
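
In the 16-bit 2x kernels each d-register carries four source pixels, and the vdup/vext pairs rebuild them as adjacent duplicates; the scalar "rest" loop does the same thing one pixel at a time with orr lr, lr, lsl #16. Word-level C equivalent of both paths:

#include <stdint.h>

// one RGB565 pixel in, one 32-bit word (the pixel twice) out
static inline uint32_t dup16x2(uint16_t pix) {
	return (uint32_t)pix | ((uint32_t)pix << 16);
}

static void scale2x_row16_model(const uint16_t* s, uint32_t* d, uint32_t sw) {
	for (uint32_t x = 0; x < sw; x++)
		d[x] = dup16x2(s[x]); // the NEON loop emits 32 of these per iteration
}
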
vdup.16 d1, d20[2] ;"
+	" vext.16 d25, d1,d0,#2 ;"
+	" vdup.16 d0, d20[1] ;"
+	" vdup.16 d1, d20[0] ;"
+	" vext.16 d24, d1,d0,#2 ;"
+	" vdup.16 d0, d19[3] ;"
+	" vdup.16 d1, d19[2] ;"
+	" vext.16 d23, d1,d0,#2 ;"
+	" vdup.16 d0, d19[1] ;"
+	" vdup.16 d1, d19[0] ;"
+	" vext.16 d22, d1,d0,#2 ;"
+	" vdup.16 d0, d18[3] ;"
+	" vdup.16 d1, d18[2] ;"
+	" vext.16 d21, d1,d0,#2 ;"
+	" vdup.16 d0, d18[1] ;"
+	" vdup.16 d1, d18[0] ;"
+	" vext.16 d20, d1,d0,#2 ;"
+	" vdup.16 d0, d17[3] ;"
+	" vdup.16 d1, d17[2] ;"
+	" vext.16 d19, d1,d0,#2 ;"
+	" vdup.16 d0, d17[1] ;"
+	" vdup.16 d1, d17[0] ;"
+	" vext.16 d18, d1,d0,#2 ;"
+	" vdup.16 d0, d16[3] ;"
+	" vdup.16 d1, d16[2] ;"
+	" vext.16 d17, d1,d0,#2 ;"
+	" vdup.16 d0, d16[1] ;"
+	" vdup.16 d1, d16[0] ;"
+	" vext.16 d16, d1,d0,#2 ;"
+	" cmp %0, lr ;"
+	" vstmia %1!, {q8-q15} ;"
+	" vstmia r9!, {q8-q15} ;"
+	" vstmia r10!, {q8-q15} ;"
+	" bne 2b ;"
+	"3: cmp %0, r8 ;"
+	" beq 5f ;"
+	" tst %3, #32 ;"
+	" beq 4f ;"
+	" vldmia %0!,{q8-q9} ;" // 16 pixels
+	" vdup.16 d0, d19[3] ;"
+	" vdup.16 d1, d19[2] ;"
+	" vext.16 d23, d1,d0,#2 ;"
+	" vdup.16 d0, d19[1] ;"
+	" vdup.16 d1, d19[0] ;"
+	" vext.16 d22, d1,d0,#2 ;"
+	" vdup.16 d0, d18[3] ;"
+	" vdup.16 d1, d18[2] ;"
+	" vext.16 d21, d1,d0,#2 ;"
+	" vdup.16 d0, d18[1] ;"
+	" vdup.16 d1, d18[0] ;"
+	" vext.16 d20, d1,d0,#2 ;"
+	" vdup.16 d0, d17[3] ;"
+	" vdup.16 d1, d17[2] ;"
+	" vext.16 d19, d1,d0,#2 ;"
+	" vdup.16 d0, d17[1] ;"
+	" vdup.16 d1, d17[0] ;"
+	" vext.16 d18, d1,d0,#2 ;"
+	" vdup.16 d0, d16[3] ;"
+	" vdup.16 d1, d16[2] ;"
+	" vext.16 d17, d1,d0,#2 ;"
+	" vdup.16 d0, d16[1] ;"
+	" vdup.16 d1, d16[0] ;"
+	" vext.16 d16, d1,d0,#2 ;"
+	" cmp %0, r8 ;"
+	" vstmia %1!, {q8-q11} ;"
+	" vstmia r9!, {q8-q11} ;"
 	" vstmia r10!, {q8-q11} ;"
 	" beq 5f ;"
 	"4: ldrh lr, [%0],#2 ;" // rest
 	" orr lr, lr, lsl #16 ;"
-	" cmp %0, r9 ;"
+	" cmp %0, r8 ;"
 	" str lr, [%1],#4 ;"
+	" str lr, [r9],#4 ;"
 	" str lr, [r10],#4 ;"
 	" bne 4b ;"
 	"5: add %0, %0, %4 ;"
@@ -202,16 +1217,199 @@
 	" cmp %0, %6 ;"
 	" bne 1b "
 	: "+r"(src), "+r"(dst)
-	: "r"(swl64), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp), "r"(swrest)
-	: "r9","r10","lr","q0","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc"
+	: "r"(swl64), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp)
+	: "r8","r9","r10","lr","q0","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc"
 	);
 }
 
-void scale2x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) {
-	if (!sw||!sh) { return; }
+void scale2x4_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) {
+	if (!sw||!sh) return;
+	uint32_t swl = sw * sizeof(uint16_t);
+	if (!sp) { sp = swl; } if (!dp) { dp = swl*2; }
+	if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale2x4_c16(src,dst,sw,sh,sp,dp); return; }
+	uint32_t swl64 = swl & ~63;
+	uint32_t sadd = sp - swl;
+	uint32_t dadd = dp*4 - swl*2;
+	uint8_t* finofs = (uint8_t*)src + (sp*sh);
+	asm volatile (
+	"1: add lr, %0, %2 ;" // lr = x64bytes offset
+	" add r8, %0, %3 ;" // r8 = lineend offset
+	" add r9, %1, %7 ;" // r9 = 2x line offset
+	" add r10, r9, %7 ;" // r10 = 3x line offset
+	" add r11, r10, %7 ;" // r11 = 4x line offset
+	" cmp %0, lr ;"
+	" beq 3f ;"
+	"2: vldmia %0!, {q8-q11} ;" // 32 pixels 64 bytes
+	" vdup.16 d0, d23[3] ;"
+	" vdup.16 d1, d23[2] ;"
+	" vext.16 d31, d1,d0,#2 ;"
+	" vdup.16 d0, d23[1] ;"
+	" vdup.16 d1, d23[0] ;"
+	" vext.16 d30, 
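
Every NEON entry point above opens with the same guard: any buffer or pitch that is not 4-byte aligned falls back to the matching C scaler, since vldmia/vstmia and the word-wise tails assume word alignment. The predicate in isolation (hypothetical helper; the kernels inline the expression):

#include <stdint.h>

static inline int neon_alignment_ok(const void* src, const void* dst,
                                    uint32_t sp, uint32_t dp) {
	return !(((uintptr_t)src & 3) | ((uintptr_t)dst & 3) | (sp & 3) | (dp & 3));
}
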
d1,d0,#2 ;" + " vdup.16 d0, d22[3] ;" + " vdup.16 d1, d22[2] ;" + " vext.16 d29, d1,d0,#2 ;" + " vdup.16 d0, d22[1] ;" + " vdup.16 d1, d22[0] ;" + " vext.16 d28, d1,d0,#2 ;" + " vdup.16 d0, d21[3] ;" + " vdup.16 d1, d21[2] ;" + " vext.16 d27, d1,d0,#2 ;" + " vdup.16 d0, d21[1] ;" + " vdup.16 d1, d21[0] ;" + " vext.16 d26, d1,d0,#2 ;" + " vdup.16 d0, d20[3] ;" + " vdup.16 d1, d20[2] ;" + " vext.16 d25, d1,d0,#2 ;" + " vdup.16 d0, d20[1] ;" + " vdup.16 d1, d20[0] ;" + " vext.16 d24, d1,d0,#2 ;" + " vdup.16 d0, d19[3] ;" + " vdup.16 d1, d19[2] ;" + " vext.16 d23, d1,d0,#2 ;" + " vdup.16 d0, d19[1] ;" + " vdup.16 d1, d19[0] ;" + " vext.16 d22, d1,d0,#2 ;" + " vdup.16 d0, d18[3] ;" + " vdup.16 d1, d18[2] ;" + " vext.16 d21, d1,d0,#2 ;" + " vdup.16 d0, d18[1] ;" + " vdup.16 d1, d18[0] ;" + " vext.16 d20, d1,d0,#2 ;" + " vdup.16 d0, d17[3] ;" + " vdup.16 d1, d17[2] ;" + " vext.16 d19, d1,d0,#2 ;" + " vdup.16 d0, d17[1] ;" + " vdup.16 d1, d17[0] ;" + " vext.16 d18, d1,d0,#2 ;" + " vdup.16 d0, d16[3] ;" + " vdup.16 d1, d16[2] ;" + " vext.16 d17, d1,d0,#2 ;" + " vdup.16 d0, d16[1] ;" + " vdup.16 d1, d16[0] ;" + " vext.16 d16, d1,d0,#2 ;" + " cmp %0, lr ;" + " vstmia %1!, {q8-q15} ;" + " vstmia r9!, {q8-q15} ;" + " vstmia r10!, {q8-q15} ;" + " vstmia r11!, {q8-q15} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + " tst %3, #32 ;" + " beq 4f ;" + " vldmia %0!,{q8-q9} ;" // 16 pixels + " vdup.16 d0, d19[3] ;" + " vdup.16 d1, d19[2] ;" + " vext.16 d23, d1,d0,#2 ;" + " vdup.16 d0, d19[1] ;" + " vdup.16 d1, d19[0] ;" + " vext.16 d22, d1,d0,#2 ;" + " vdup.16 d0, d18[3] ;" + " vdup.16 d1, d18[2] ;" + " vext.16 d21, d1,d0,#2 ;" + " vdup.16 d0, d18[1] ;" + " vdup.16 d1, d18[0] ;" + " vext.16 d20, d1,d0,#2 ;" + " vdup.16 d0, d17[3] ;" + " vdup.16 d1, d17[2] ;" + " vext.16 d19, d1,d0,#2 ;" + " vdup.16 d0, d17[1] ;" + " vdup.16 d1, d17[0] ;" + " vext.16 d18, d1,d0,#2 ;" + " vdup.16 d0, d16[3] ;" + " vdup.16 d1, d16[2] ;" + " vext.16 d17, d1,d0,#2 ;" + " vdup.16 d0, d16[1] ;" + " vdup.16 d1, d16[0] ;" + " vext.16 d16, d1,d0,#2 ;" + " cmp %0, r8 ;" + " vstmia %1!, {q8-q11} ;" + " vstmia r9!, {q8-q11} ;" + " vstmia r10!, {q8-q11} ;" + " vstmia r11!, {q8-q11} ;" + " beq 5f ;" + "4: ldrh lr, [%0],#2 ;" // rest + " orr lr, lr, lsl #16 ;" + " cmp %0, r8 ;" + " str lr, [%1],#4 ;" + " str lr, [r9],#4 ;" + " str lr, [r10],#4 ;" + " str lr, [r11],#4 ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl64), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","r10","r11","lr","q0","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale2x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + const void (*func[4])(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t) + = { &scale2x1_n16, &scale2x2_n16, &scale2x3_n16, &scale2x4_n16 }; + if (--ymul < 4) func[ymul](src, dst, sw, sh, sp, dp); + return; +} + +void scale2x1_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; uint32_t swl = sw * sizeof(uint32_t); if (!sp) { sp = swl; } if (!dp) { dp = swl*2; } - if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale2x_c32(src,dst,sw,sh,sp,dp); return; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale2x1_c32(src,dst,sw,sh,sp,dp); return; } + uint32_t swl64 = swl & ~63; + uint32_t sadd = sp - swl; + uint32_t dadd = dp - 
swl*2; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x64bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q11} ;" // 16 pixels 64 bytes + " vdup.32 d31, d23[1] ;" + " vdup.32 d30, d23[0] ;" + " vdup.32 d29, d22[1] ;" + " vdup.32 d28, d22[0] ;" + " vdup.32 d27, d21[1] ;" + " vdup.32 d26, d21[0] ;" + " vdup.32 d25, d20[1] ;" + " vdup.32 d24, d20[0] ;" + " vdup.32 d23, d19[1] ;" + " vdup.32 d22, d19[0] ;" + " vdup.32 d21, d18[1] ;" + " vdup.32 d20, d18[0] ;" + " vdup.32 d19, d17[1] ;" + " vdup.32 d18, d17[0] ;" + " vdup.32 d17, d16[1] ;" + " vdup.32 d16, d16[0] ;" + " cmp %0, lr ;" + " vstmia %1!, {q8-q15} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldr lr, [%0],#4 ;" // rest + " vdup.32 d16, lr ;" + " cmp %0, r8 ;" + " vstmia %1!, {d16} ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl64), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale2x2_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint32_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*2; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale2x2_c32(src,dst,sw,sh,sp,dp); return; } uint32_t swl64 = swl & ~63; uint32_t sadd = sp - swl; uint32_t dadd = dp*2 - swl*2; @@ -261,22 +1459,142 @@ void scale2x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32 ); } -void scale3x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } +void scale2x3_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint32_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*2; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale2x3_c32(src,dst,sw,sh,sp,dp); return; } + uint32_t swl64 = swl & ~63; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*3 - swl*2; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x64bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " add r10, r9, %7 ;" // r10 = 3x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q11} ;" // 16 pixels 64 bytes + " vdup.32 d31, d23[1] ;" + " vdup.32 d30, d23[0] ;" + " vdup.32 d29, d22[1] ;" + " vdup.32 d28, d22[0] ;" + " vdup.32 d27, d21[1] ;" + " vdup.32 d26, d21[0] ;" + " vdup.32 d25, d20[1] ;" + " vdup.32 d24, d20[0] ;" + " vdup.32 d23, d19[1] ;" + " vdup.32 d22, d19[0] ;" + " vdup.32 d21, d18[1] ;" + " vdup.32 d20, d18[0] ;" + " vdup.32 d19, d17[1] ;" + " vdup.32 d18, d17[0] ;" + " vdup.32 d17, d16[1] ;" + " vdup.32 d16, d16[0] ;" + " cmp %0, lr ;" + " vstmia %1!, {q8-q15} ;" + " vstmia r9!, {q8-q15} ;" + " vstmia r10!, {q8-q15} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldr lr, [%0],#4 ;" // rest + " vdup.32 d16, lr ;" + " cmp %0, r8 ;" + " vstmia %1!, {d16} ;" + " vstmia r9!, {d16} ;" + " vstmia r10!, {d16} ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl64), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : 
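
The 32-bit 2x kernels need no vext shuffling at all: vdup.32 alone duplicates each pixel, and extra output rows are just extra vstmia targets. Row-level C equivalent of the 2x2 case:

#include <stdint.h>
#include <string.h>

static void scale2x2_row32_model(const uint32_t* s, uint32_t* d0, uint32_t* d1, uint32_t sw) {
	for (uint32_t x = 0; x < sw; x++)
		d0[2*x] = d0[2*x + 1] = s[x];                      // vdup.32 per pixel
	memcpy(d1, d0, (size_t)sw * 2 * sizeof(uint32_t));     // second row via r9
}

(The assembly writes both rows inside the same pass rather than copying afterwards, but the result is identical.)
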
"r8","r9","r10","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale2x4_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint32_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*2; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale2x4_c32(src,dst,sw,sh,sp,dp); return; } + uint32_t swl64 = swl & ~63; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*4 - swl*2; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x64bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " add r10, r9, %7 ;" // r10 = 3x line offset + " add r11, r10, %7 ;" // r11 = 4x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q11} ;" // 16 pixels 64 bytes + " vdup.32 d31, d23[1] ;" + " vdup.32 d30, d23[0] ;" + " vdup.32 d29, d22[1] ;" + " vdup.32 d28, d22[0] ;" + " vdup.32 d27, d21[1] ;" + " vdup.32 d26, d21[0] ;" + " vdup.32 d25, d20[1] ;" + " vdup.32 d24, d20[0] ;" + " vdup.32 d23, d19[1] ;" + " vdup.32 d22, d19[0] ;" + " vdup.32 d21, d18[1] ;" + " vdup.32 d20, d18[0] ;" + " vdup.32 d19, d17[1] ;" + " vdup.32 d18, d17[0] ;" + " vdup.32 d17, d16[1] ;" + " vdup.32 d16, d16[0] ;" + " cmp %0, lr ;" + " vstmia %1!, {q8-q15} ;" + " vstmia r9!, {q8-q15} ;" + " vstmia r10!, {q8-q15} ;" + " vstmia r11!, {q8-q15} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldr lr, [%0],#4 ;" // rest + " vdup.32 d16, lr ;" + " cmp %0, r8 ;" + " vstmia %1!, {d16} ;" + " vstmia r9!, {d16} ;" + " vstmia r10!, {d16} ;" + " vstmia r11!, {d16} ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl64), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","r10","r11","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale2x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + const void (*func[4])(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t) + = { &scale2x1_n32, &scale2x2_n32, &scale2x3_n32, &scale2x4_n32 }; + if (--ymul < 4) func[ymul](src, dst, sw, sh, sp, dp); + return; +} + +void scale3x1_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; uint32_t swl = sw * sizeof(uint16_t); if (!sp) { sp = swl; } if (!dp) { dp = swl*3; } - if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale3x_c16(src,dst,sw,sh,sp,dp); return; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale3x1_c16(src,dst,sw,sh,sp,dp); return; } uint32_t swl32 = swl & ~31; uint32_t sadd = sp - swl; uint32_t dadd = dp - swl*3; - uint32_t dwl = swl*3; - uint32_t dwl128 = dwl & ~127; - uint32_t dwrest = dwl & 127; uint8_t* finofs = (uint8_t*)src + (sp*sh); asm volatile ( - "1: mov r11,%1 ;" // dst push - " add lr, %0, %2 ;" // lr = x32bytes offset - " add r10, %0, %3 ;" // r10 = lineend offset + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset " cmp %0, lr ;" " beq 3f ;" "2: vldmia %0!, {q8-q9} ;" // 16 pixels 32 bytes @@ -311,74 +1629,259 @@ void scale3x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32 " cmp %0, lr ;" " vstmia %1!, {q8-q13} ;" " bne 2b ;" - "3: cmp %0, r10 ;" + "3: cmp %0, r8 ;" " beq 5f ;" "4: ldrh lr, [%0],#2 ;" // 
rest " orr lr, lr, lsl #16 ;" - " cmp %0, r10 ;" + " cmp %0, r8 ;" " str lr, [%1],#4 ;" " strh lr, [%1],#2 ;" " bne 4b ;" - "5: add %0, %4 ;" - " add %1, %5 ;" - " mov r12, %1 ;" // r12 = 2x line offset - " add %1, %8 ;" // - " add %1, %5 ;" // %1 = 3x line offset - " add lr, r11, %7 ;" // lr = x128bytes offset - " add r10, r11, %8 ;" // r10 = lineend offset - " cmp r11, lr ;" - " beq 7f ;" - "6: vldmia r11!, {q8-q15} ;" // 64 pixels 128 bytes - " vstmia r12!, {q8-q15} ;" - " vstmia %1!, {q8-q15} ;" - " cmp r11, lr ;" - " bne 6b ;" - "7: cmp r11, r10 ;" - " beq 10f ;" - " tst %9, #64 ;" - " beq 8f ;" - " vldmia r11!, {q8-q11} ;" // 32 pixels - " vstmia r12!, {q8-q11} ;" - " vstmia %1!, {q8-q11} ;" - " cmp r11, r10 ;" - " beq 10f ;" - "8: tst %9, #32 ;" - " beq 9f ;" - " vldmia r11!, {q8-q9} ;" // 16 pixels - " vstmia r12!, {q8-q9} ;" - " vstmia %1!, {q8-q9} ;" - " cmp r11, r10 ;" - " beq 10f ;" - "9: ldrh lr, [r11],#2 ;" // rest - " strh lr, [r12],#2 ;" - " strh lr, [%1],#2 ;" - " cmp r11, r10 ;" - " bne 9b ;" - "10: add %1, %5 ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" " cmp %0, %6 ;" " bne 1b " : "+r"(src), "+r"(dst) - : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dwl128), "r"(dwl), "r"(dwrest) - : "r10","r11","r12","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" ); } -void scale3x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } +void scale3x2_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint16_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*3; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale3x2_c16(src,dst,sw,sh,sp,dp); return; } + uint32_t swl32 = swl & ~31; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*2 - swl*3; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q9} ;" // 16 pixels 32 bytes + " vdup.16 d31, d19[3] ;" // FFFF + " vdup.16 d30, d19[2] ;" // EEEE + " vdup.16 d29, d19[1] ;" // DDDD + " vdup.16 d28, d19[0] ;" // CCCC + " vext.16 d27, d30,d31,#3 ;" // EFFF + " vext.16 d26, d29,d30,#2 ;" // DDEE + " vext.16 d25, d28,d29,#1 ;" // CCCD + " vdup.16 d31, d18[3] ;" // BBBB + " vdup.16 d30, d18[2] ;" // AAAA + " vdup.16 d29, d18[1] ;" // 9999 + " vdup.16 d28, d18[0] ;" // 8888 + " vext.16 d24, d30,d31,#3 ;" // ABBB + " vext.16 d23, d29,d30,#2 ;" // 99AA + " vext.16 d22, d28,d29,#1 ;" // 8889 + " vdup.16 d31, d17[3] ;" // 7777 + " vdup.16 d30, d17[2] ;" // 6666 + " vdup.16 d29, d17[1] ;" // 5555 + " vdup.16 d28, d17[0] ;" // 4444 + " vext.16 d21, d30,d31,#3 ;" // 6777 + " vext.16 d20, d29,d30,#2 ;" // 5566 + " vext.16 d19, d28,d29,#1 ;" // 4445 + " vdup.16 d31, d16[3] ;" // 3333 + " vdup.16 d30, d16[2] ;" // 2222 + " vdup.16 d29, d16[1] ;" // 1111 + " vdup.16 d28, d16[0] ;" // 0000 + " vext.16 d18, d30,d31,#3 ;" // 2333 + " vext.16 d17, d29,d30,#2 ;" // 1122 + " vext.16 d16, d28,d29,#1 ;" // 0001 + " cmp %0, lr ;" + " vstmia %1!, {q8-q13} ;" + " vstmia r9!, {q8-q13} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldrh lr, [%0],#2 ;" // rest + " orr lr, lr, lsl #16 ;" + " cmp 
%0, r8 ;" + " str lr, [%1],#4 ;" + " strh lr, [%1],#2 ;" + " str lr, [r9],#4 ;" + " strh lr, [r9],#2 ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale3x3_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint16_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*3; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale3x3_c16(src,dst,sw,sh,sp,dp); return; } + uint32_t swl32 = swl & ~31; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*3 - swl*3; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " add r10, r9, %7 ;" // r10 = 3x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q9} ;" // 16 pixels 32 bytes + " vdup.16 d31, d19[3] ;" // FFFF + " vdup.16 d30, d19[2] ;" // EEEE + " vdup.16 d29, d19[1] ;" // DDDD + " vdup.16 d28, d19[0] ;" // CCCC + " vext.16 d27, d30,d31,#3 ;" // EFFF + " vext.16 d26, d29,d30,#2 ;" // DDEE + " vext.16 d25, d28,d29,#1 ;" // CCCD + " vdup.16 d31, d18[3] ;" // BBBB + " vdup.16 d30, d18[2] ;" // AAAA + " vdup.16 d29, d18[1] ;" // 9999 + " vdup.16 d28, d18[0] ;" // 8888 + " vext.16 d24, d30,d31,#3 ;" // ABBB + " vext.16 d23, d29,d30,#2 ;" // 99AA + " vext.16 d22, d28,d29,#1 ;" // 8889 + " vdup.16 d31, d17[3] ;" // 7777 + " vdup.16 d30, d17[2] ;" // 6666 + " vdup.16 d29, d17[1] ;" // 5555 + " vdup.16 d28, d17[0] ;" // 4444 + " vext.16 d21, d30,d31,#3 ;" // 6777 + " vext.16 d20, d29,d30,#2 ;" // 5566 + " vext.16 d19, d28,d29,#1 ;" // 4445 + " vdup.16 d31, d16[3] ;" // 3333 + " vdup.16 d30, d16[2] ;" // 2222 + " vdup.16 d29, d16[1] ;" // 1111 + " vdup.16 d28, d16[0] ;" // 0000 + " vext.16 d18, d30,d31,#3 ;" // 2333 + " vext.16 d17, d29,d30,#2 ;" // 1122 + " vext.16 d16, d28,d29,#1 ;" // 0001 + " cmp %0, lr ;" + " vstmia %1!, {q8-q13} ;" + " vstmia r9!, {q8-q13} ;" + " vstmia r10!, {q8-q13} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldrh lr, [%0],#2 ;" // rest + " orr lr, lr, lsl #16 ;" + " cmp %0, r8 ;" + " str lr, [%1],#4 ;" + " strh lr, [%1],#2 ;" + " str lr, [r9],#4 ;" + " strh lr, [r9],#2 ;" + " str lr, [r10],#4 ;" + " strh lr, [r10],#2 ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","r10","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale3x4_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint16_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*3; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale3x4_c16(src,dst,sw,sh,sp,dp); return; } + uint32_t swl32 = swl & ~31; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*4 - swl*3; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " add r10, r9, %7 ;" // r10 = 3x line offset + " add r11, r10, %7 ;" // r11 = 4x line offset + " 
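
At 3x the outputs stop lining up with register boundaries: four source pixels become twelve output pixels, i.e. three d-registers, which is why these kernels chain vdup with vext and annotate each result with its lane pattern (0001, 1122, 2333, ...). The same expansion in C:

#include <stdint.h>

// four 16-bit pixels in, twelve out; lane groups 0001 / 1122 / 2333
static void scale3x_quad16_model(const uint16_t s[4], uint16_t d[12]) {
	for (int i = 0; i < 12; i++)
		d[i] = s[i / 3];
}
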
cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!, {q8-q9} ;" // 16 pixels 32 bytes + " vdup.16 d31, d19[3] ;" // FFFF + " vdup.16 d30, d19[2] ;" // EEEE + " vdup.16 d29, d19[1] ;" // DDDD + " vdup.16 d28, d19[0] ;" // CCCC + " vext.16 d27, d30,d31,#3 ;" // EFFF + " vext.16 d26, d29,d30,#2 ;" // DDEE + " vext.16 d25, d28,d29,#1 ;" // CCCD + " vdup.16 d31, d18[3] ;" // BBBB + " vdup.16 d30, d18[2] ;" // AAAA + " vdup.16 d29, d18[1] ;" // 9999 + " vdup.16 d28, d18[0] ;" // 8888 + " vext.16 d24, d30,d31,#3 ;" // ABBB + " vext.16 d23, d29,d30,#2 ;" // 99AA + " vext.16 d22, d28,d29,#1 ;" // 8889 + " vdup.16 d31, d17[3] ;" // 7777 + " vdup.16 d30, d17[2] ;" // 6666 + " vdup.16 d29, d17[1] ;" // 5555 + " vdup.16 d28, d17[0] ;" // 4444 + " vext.16 d21, d30,d31,#3 ;" // 6777 + " vext.16 d20, d29,d30,#2 ;" // 5566 + " vext.16 d19, d28,d29,#1 ;" // 4445 + " vdup.16 d31, d16[3] ;" // 3333 + " vdup.16 d30, d16[2] ;" // 2222 + " vdup.16 d29, d16[1] ;" // 1111 + " vdup.16 d28, d16[0] ;" // 0000 + " vext.16 d18, d30,d31,#3 ;" // 2333 + " vext.16 d17, d29,d30,#2 ;" // 1122 + " vext.16 d16, d28,d29,#1 ;" // 0001 + " cmp %0, lr ;" + " vstmia %1!, {q8-q13} ;" + " vstmia r9!, {q8-q13} ;" + " vstmia r10!, {q8-q13} ;" + " vstmia r11!, {q8-q13} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldrh lr, [%0],#2 ;" // rest + " orr lr, lr, lsl #16 ;" + " cmp %0, r8 ;" + " str lr, [%1],#4 ;" + " strh lr, [%1],#2 ;" + " str lr, [r9],#4 ;" + " strh lr, [r9],#2 ;" + " str lr, [r10],#4 ;" + " strh lr, [r10],#2 ;" + " str lr, [r11],#4 ;" + " strh lr, [r11],#2 ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","r10","r11","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale3x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + const void (*func[4])(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t) + = { &scale3x1_n16, &scale3x2_n16, &scale3x3_n16, &scale3x4_n16 }; + if (--ymul < 4) func[ymul](src, dst, sw, sh, sp, dp); + return; +} + +void scale3x1_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; uint32_t swl = sw * sizeof(uint32_t); if (!sp) { sp = swl; } if (!dp) { dp = swl*3; } - if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale3x_c32(src,dst,sw,sh,sp,dp); return; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale3x1_c32(src,dst,sw,sh,sp,dp); return; } uint32_t swl32 = swl & ~31; uint32_t sadd = sp - swl; uint32_t dadd = dp - swl*3; - uint32_t dwl = swl*3; - uint32_t dwl128 = dwl & ~127; - uint32_t dwrest = dwl & 127; uint8_t* finofs = (uint8_t*)src + (sp*sh); asm volatile ( - "1: mov r11,%1 ;" // dst push - " add lr, %0, %2 ;" // lr = x32bytes offset - " add r10, %0, %3 ;" // r10 = lineend offset + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset " cmp %0, lr ;" " beq 3f ;" "2: vldmia %0!,{q8-q9} ;" // 8 pixels 32 bytes @@ -399,63 +1902,372 @@ void scale3x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32 " cmp %0, lr ;" " vstmia %1!,{q8-q13} ;" " bne 2b ;" - "3: cmp %0, r10 ;" + "3: cmp %0, r8 ;" " beq 5f ;" "4: ldr lr, [%0],#4 ;" // rest " vdup.32 d16, lr ;" - " cmp %0, r10 ;" + " cmp %0, r8 ;" " vstmia %1!, {d16} ;" " str lr, [%1],#4 ;" " bne 4b ;" - "5: add %0, %4 
;" - " add %1, %5 ;" - " mov r12, %1 ;" // r12 = 2x line offset - " add %1, %8 ;" // - " add %1, %5 ;" // %1 = 3x line offset - " add lr, r11, %7 ;" // lr = x128bytes offset - " add r10, r11, %8 ;" // r10 = lineend offset - " cmp r11, lr ;" - " beq 7f ;" - "6: vldmia r11!, {q8-q15} ;" // 32 pixels 128 bytes - " vstmia r12!, {q8-q15} ;" - " vstmia %1!, {q8-q15} ;" - " cmp r11, lr ;" - " bne 6b ;" - "7: cmp r11, r10 ;" - " beq 10f ;" - " tst %9, #64 ;" - " beq 8f ;" - " vldmia r11!, {q8-q11} ;" // 16 pixels - " vstmia r12!, {q8-q11} ;" - " vstmia %1!, {q8-q11} ;" - " cmp r11, r10 ;" - " beq 10f ;" - "8: tst %9, #32 ;" - " beq 9f ;" - " vldmia r11!, {q8-q9} ;" // 8 pixels - " vstmia r12!, {q8-q9} ;" - " vstmia %1!, {q8-q9} ;" - " cmp r11, r10 ;" - " beq 10f ;" - "9: ldr lr, [r11],#4 ;" // rest - " str lr, [r12],#4 ;" - " str lr, [%1],#4 ;" - " cmp r11, r10 ;" - " bne 9b ;" - "10: add %1, %5 ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" " cmp %0, %6 ;" " bne 1b " : "+r"(src), "+r"(dst) - : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dwl128), "r"(dwl), "r"(dwrest) - : "r10","r11","r12","lr","q0","q1","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" ); } -void scale4x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } +void scale3x2_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint32_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*3; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale3x2_c32(src,dst,sw,sh,sp,dp); return; } + uint32_t swl32 = swl & ~31; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*2 - swl*3; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!,{q8-q9} ;" // 8 pixels 32 bytes + " vdup.32 q15, d19[1] ;" // 7777 + " vdup.32 q14, d19[0] ;" // 6666 + " vdup.32 q1, d18[1] ;" // 5555 + " vdup.32 q0, d18[0] ;" // 4444 + " vext.32 q13, q14,q15,#3 ;" // 6777 + " vext.32 q12, q1,q14,#2 ;" // 5566 + " vext.32 q11, q0,q1,#1 ;" // 4445 + " vdup.32 q15, d17[1] ;" // 3333 + " vdup.32 q14, d17[0] ;" // 2222 + " vdup.32 q1, d16[1] ;" // 1111 + " vdup.32 q0, d16[0] ;" // 0000 + " vext.32 q10, q14,q15,#3 ;" // 2333 + " vext.32 q9, q1,q14,#2 ;" // 1122 + " vext.32 q8, q0,q1,#1 ;" // 0001 + " cmp %0, lr ;" + " vstmia %1!,{q8-q13} ;" + " vstmia r9!,{q8-q13} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldr lr, [%0],#4 ;" // rest + " vdup.32 d16, lr ;" + " cmp %0, r8 ;" + " vstmia %1!, {d16} ;" + " str lr, [%1],#4 ;" + " vstmia r9!, {d16} ;" + " str lr, [r9],#4 ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale3x3_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint32_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*3; } + if ( 
((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale3x3_c32(src,dst,sw,sh,sp,dp); return; } + uint32_t swl32 = swl & ~31; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*3 - swl*3; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " add r10, r9, %7 ;" // r10 = 3x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!,{q8-q9} ;" // 8 pixels 32 bytes + " vdup.32 q15, d19[1] ;" // 7777 + " vdup.32 q14, d19[0] ;" // 6666 + " vdup.32 q1, d18[1] ;" // 5555 + " vdup.32 q0, d18[0] ;" // 4444 + " vext.32 q13, q14,q15,#3 ;" // 6777 + " vext.32 q12, q1,q14,#2 ;" // 5566 + " vext.32 q11, q0,q1,#1 ;" // 4445 + " vdup.32 q15, d17[1] ;" // 3333 + " vdup.32 q14, d17[0] ;" // 2222 + " vdup.32 q1, d16[1] ;" // 1111 + " vdup.32 q0, d16[0] ;" // 0000 + " vext.32 q10, q14,q15,#3 ;" // 2333 + " vext.32 q9, q1,q14,#2 ;" // 1122 + " vext.32 q8, q0,q1,#1 ;" // 0001 + " cmp %0, lr ;" + " vstmia %1!,{q8-q13} ;" + " vstmia r9!,{q8-q13} ;" + " vstmia r10!,{q8-q13} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldr lr, [%0],#4 ;" // rest + " vdup.32 d16, lr ;" + " cmp %0, r8 ;" + " vstmia %1!, {d16} ;" + " str lr, [%1],#4 ;" + " vstmia r9!, {d16} ;" + " str lr, [r9],#4 ;" + " vstmia r10!, {d16} ;" + " str lr, [r10],#4 ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","r10","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale3x4_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint32_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*3; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale3x4_c32(src,dst,sw,sh,sp,dp); return; } + uint32_t swl32 = swl & ~31; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*4 - swl*3; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " add r10, r9, %7 ;" // r10 = 3x line offset + " add r11, r10, %7 ;" // r11 = 4x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!,{q8-q9} ;" // 8 pixels 32 bytes + " vdup.32 q15, d19[1] ;" // 7777 + " vdup.32 q14, d19[0] ;" // 6666 + " vdup.32 q1, d18[1] ;" // 5555 + " vdup.32 q0, d18[0] ;" // 4444 + " vext.32 q13, q14,q15,#3 ;" // 6777 + " vext.32 q12, q1,q14,#2 ;" // 5566 + " vext.32 q11, q0,q1,#1 ;" // 4445 + " vdup.32 q15, d17[1] ;" // 3333 + " vdup.32 q14, d17[0] ;" // 2222 + " vdup.32 q1, d16[1] ;" // 1111 + " vdup.32 q0, d16[0] ;" // 0000 + " vext.32 q10, q14,q15,#3 ;" // 2333 + " vext.32 q9, q1,q14,#2 ;" // 1122 + " vext.32 q8, q0,q1,#1 ;" // 0001 + " cmp %0, lr ;" + " vstmia %1!,{q8-q13} ;" + " vstmia r9!,{q8-q13} ;" + " vstmia r10!,{q8-q13} ;" + " vstmia r11!,{q8-q13} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldr lr, [%0],#4 ;" // rest + " vdup.32 d16, lr ;" + " cmp %0, r8 ;" + " vstmia %1!, {d16} ;" + " str lr, [%1],#4 ;" + " vstmia r9!, {d16} ;" + " str lr, [r9],#4 ;" + " vstmia r10!, {d16} ;" + " str lr, [r10],#4 ;" + " vstmia r11!, {d16} ;" + " str lr, [r11],#4 ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : 
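
With a C and a NEON version of every kernel in the same file, a new variant can be sanity-checked by diffing outputs and timing both paths. A throwaway harness along these lines (hypothetical, not part of the patch; build for the ARM target with this file linked in):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

void scale3x3_c32(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t);
void scale3x3_n32(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t);

static double secs(void) {
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main(void) {
	enum { SW = 320, SH = 240, REPS = 100 };
	uint32_t* src = malloc(SW * SH * 4); // malloc is suitably aligned for the NEON path
	uint32_t* a   = malloc(SW*3 * SH*3 * 4);
	uint32_t* b   = malloc(SW*3 * SH*3 * 4);
	for (int i = 0; i < SW * SH; i++) src[i] = (uint32_t)rand();
	double t0 = secs();
	for (int i = 0; i < REPS; i++) scale3x3_c32(src, a, SW, SH, 0, 0);
	double t1 = secs();
	for (int i = 0; i < REPS; i++) scale3x3_n32(src, b, SW, SH, 0, 0);
	double t2 = secs();
	printf("c32 %.3fs  n32 %.3fs  %s\n", t1 - t0, t2 - t1,
	       memcmp(a, b, SW*3 * SH*3 * 4) ? "MISMATCH" : "match");
	free(src); free(a); free(b);
	return 0;
}
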
"r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","r10","r11","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale3x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + const void (*func[4])(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t) + = { &scale3x1_n32, &scale3x2_n32, &scale3x3_n32, &scale3x4_n32 }; + if (--ymul < 4) func[ymul](src, dst, sw, sh, sp, dp); + return; +} + +void scale4x1_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; uint32_t swl = sw * sizeof(uint16_t); if (!sp) { sp = swl; } if (!dp) { dp = swl*4; } - if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale4x_c16(src,dst,sw,sh,sp,dp); return; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale4x1_c16(src,dst,sw,sh,sp,dp); return; } + uint32_t swl32 = swl & ~31; + uint32_t sadd = sp - swl; + uint32_t dadd = dp - swl*4; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!,{q8-q9} ;" // 16 pixels 32 bytes + " vdup.16 d31,d19[3] ;" + " vdup.16 d30,d19[2] ;" + " vdup.16 d29,d19[1] ;" + " vdup.16 d28,d19[0] ;" + " vdup.16 d27,d18[3] ;" + " vdup.16 d26,d18[2] ;" + " vdup.16 d25,d18[1] ;" + " vdup.16 d24,d18[0] ;" + " vdup.16 d23,d17[3] ;" + " vdup.16 d22,d17[2] ;" + " vdup.16 d21,d17[1] ;" + " vdup.16 d20,d17[0] ;" + " vdup.16 d19,d16[3] ;" + " vdup.16 d18,d16[2] ;" + " vdup.16 d17,d16[1] ;" + " vdup.16 d16,d16[0] ;" + " cmp %0, lr ;" + " vstmia %1!,{q8-q15} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldrh lr, [%0],#2 ;" // rest + " vdup.16 d16, lr ;" + " cmp %0, r8 ;" + " vstmia %1!, {d16} ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale4x2_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint16_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*4; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale4x2_c16(src,dst,sw,sh,sp,dp); return; } + uint32_t swl32 = swl & ~31; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*2 - swl*4; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!,{q8-q9} ;" // 16 pixels 32 bytes + " vdup.16 d31,d19[3] ;" + " vdup.16 d30,d19[2] ;" + " vdup.16 d29,d19[1] ;" + " vdup.16 d28,d19[0] ;" + " vdup.16 d27,d18[3] ;" + " vdup.16 d26,d18[2] ;" + " vdup.16 d25,d18[1] ;" + " vdup.16 d24,d18[0] ;" + " vdup.16 d23,d17[3] ;" + " vdup.16 d22,d17[2] ;" + " vdup.16 d21,d17[1] ;" + " vdup.16 d20,d17[0] ;" + " vdup.16 d19,d16[3] ;" + " vdup.16 d18,d16[2] ;" + " vdup.16 d17,d16[1] ;" + " vdup.16 d16,d16[0] ;" + " cmp %0, lr ;" + " vstmia %1!,{q8-q15} ;" + " vstmia r9!,{q8-q15} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldrh lr, [%0],#2 ;" // rest + " vdup.16 d16, lr ;" + " cmp %0, r8 ;" + " vstmia %1!, {d16} ;" + " vstmia r9!, 
{d16} ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale4x3_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint16_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*4; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale4x3_c16(src,dst,sw,sh,sp,dp); return; } + uint32_t swl32 = swl & ~31; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*3 - swl*4; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " add r10, r9, %7 ;" // r10 = 3x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!,{q8-q9} ;" // 16 pixels 32 bytes + " vdup.16 d31,d19[3] ;" + " vdup.16 d30,d19[2] ;" + " vdup.16 d29,d19[1] ;" + " vdup.16 d28,d19[0] ;" + " vdup.16 d27,d18[3] ;" + " vdup.16 d26,d18[2] ;" + " vdup.16 d25,d18[1] ;" + " vdup.16 d24,d18[0] ;" + " vdup.16 d23,d17[3] ;" + " vdup.16 d22,d17[2] ;" + " vdup.16 d21,d17[1] ;" + " vdup.16 d20,d17[0] ;" + " vdup.16 d19,d16[3] ;" + " vdup.16 d18,d16[2] ;" + " vdup.16 d17,d16[1] ;" + " vdup.16 d16,d16[0] ;" + " cmp %0, lr ;" + " vstmia %1!,{q8-q15} ;" + " vstmia r9!,{q8-q15} ;" + " vstmia r10!,{q8-q15} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldrh lr, [%0],#2 ;" // rest + " vdup.16 d16, lr ;" + " cmp %0, r8 ;" + " vstmia %1!, {d16} ;" + " vstmia r9!, {d16} ;" + " vstmia r10!, {d16} ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","r10","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale4x4_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint16_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*4; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale4x4_c16(src,dst,sw,sh,sp,dp); return; } uint32_t swl32 = swl & ~31; uint32_t sadd = sp - swl; uint32_t dadd = dp*4 - swl*4; @@ -511,11 +2323,156 @@ void scale4x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32 ); } -void scale4x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } +void scale4x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + const void (*func[4])(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t) + = { &scale4x1_n16, &scale4x2_n16, &scale4x3_n16, &scale4x4_n16 }; + if (--ymul < 4) func[ymul](src, dst, sw, sh, sp, dp); + return; +} + +void scale4x1_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; uint32_t swl = sw * sizeof(uint32_t); if (!sp) { sp = swl; } if (!dp) { dp = swl*4; } - if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale4x_c32(src,dst,sw,sh,sp,dp); return; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale4x1_c32(src,dst,sw,sh,sp,dp); return; } + uint32_t swl32 
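
At 4x the boundaries line up again: one 16-bit pixel fills exactly one d-register (four lanes), so the kernels are pure vdup.16 with no vext pass. C equivalent of the row expansion:

#include <stdint.h>

static void scale4x_row16_model(const uint16_t* s, uint16_t* d, uint32_t sw) {
	for (uint32_t x = 0; x < sw; x++) // one vdup.16 dN, dM[i] per pixel
		d[4*x] = d[4*x+1] = d[4*x+2] = d[4*x+3] = s[x];
}
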
= swl & ~31; + uint32_t sadd = sp - swl; + uint32_t dadd = dp - swl*4; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!,{q8-q9} ;" // 8 pixels 32 bytes + " vdup.32 q15,d19[1] ;" + " vdup.32 q14,d19[0] ;" + " vdup.32 q13,d18[1] ;" + " vdup.32 q12,d18[0] ;" + " vdup.32 q11,d17[1] ;" + " vdup.32 q10,d17[0] ;" + " vdup.32 q9,d16[1] ;" + " vdup.32 q8,d16[0] ;" + " cmp %0, lr ;" + " vstmia %1!,{q8-q15} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldr lr, [%0],#4 ;" // rest + " vdup.32 q8, lr ;" + " cmp %0, r8 ;" + " vstmia %1!, {q8} ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale4x2_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint32_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*4; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale4x2_c32(src,dst,sw,sh,sp,dp); return; } + uint32_t swl32 = swl & ~31; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*2 - swl*4; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!,{q8-q9} ;" // 8 pixels 32 bytes + " vdup.32 q15,d19[1] ;" + " vdup.32 q14,d19[0] ;" + " vdup.32 q13,d18[1] ;" + " vdup.32 q12,d18[0] ;" + " vdup.32 q11,d17[1] ;" + " vdup.32 q10,d17[0] ;" + " vdup.32 q9,d16[1] ;" + " vdup.32 q8,d16[0] ;" + " cmp %0, lr ;" + " vstmia %1!,{q8-q15} ;" + " vstmia r9!,{q8-q15} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldr lr, [%0],#4 ;" // rest + " vdup.32 q8, lr ;" + " cmp %0, r8 ;" + " vstmia %1!, {q8} ;" + " vstmia r9!, {q8} ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale4x3_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint32_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*4; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale4x3_c32(src,dst,sw,sh,sp,dp); return; } + uint32_t swl32 = swl & ~31; + uint32_t sadd = sp - swl; + uint32_t dadd = dp*3 - swl*4; + uint8_t* finofs = (uint8_t*)src + (sp*sh); + asm volatile ( + "1: add lr, %0, %2 ;" // lr = x32bytes offset + " add r8, %0, %3 ;" // r8 = lineend offset + " add r9, %1, %7 ;" // r9 = 2x line offset + " add r10, r9, %7 ;" // r10 = 3x line offset + " cmp %0, lr ;" + " beq 3f ;" + "2: vldmia %0!,{q8-q9} ;" // 8 pixels 32 bytes + " vdup.32 q15,d19[1] ;" + " vdup.32 q14,d19[0] ;" + " vdup.32 q13,d18[1] ;" + " vdup.32 q12,d18[0] ;" + " vdup.32 q11,d17[1] ;" + " vdup.32 q10,d17[0] ;" + " vdup.32 q9,d16[1] ;" + " vdup.32 q8,d16[0] ;" + " cmp %0, lr ;" + " vstmia %1!,{q8-q15} ;" + " vstmia r9!,{q8-q15} ;" + " vstmia r10!,{q8-q15} ;" + " bne 2b ;" + "3: cmp %0, r8 ;" + " beq 5f ;" + "4: ldr lr, [%0],#4 ;" // rest + 
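
The reason this family is an NxM matrix rather than a single NxN scaler: the per-axis integer multiples that fit a panel are usually different. A sketch of how a frontend might pick them and dispatch (hypothetical helper with made-up names; only the 4x column is spelled out):

#include <stdint.h>

void scale4x_n16(void* __restrict, void* __restrict,
                 uint32_t, uint32_t, uint32_t, uint32_t, uint32_t);

// e.g. 160x144 (Game Boy) on a 640x480 panel: xmul=4, ymul=3 -> 640x432, centered
static void blit_integer16(void* src, void* fb, uint32_t sw, uint32_t sh,
                           uint32_t fbw, uint32_t fbh, uint32_t fb_pitch) {
	uint32_t xmul = fbw / sw; if (xmul > 4) xmul = 4; // kernels above cover 1x..4x
	uint32_t ymul = fbh / sh; if (ymul > 4) ymul = 4;
	uint32_t dw = sw * xmul, dh = sh * ymul;
	uint8_t* dst = (uint8_t*)fb + ((fbh - dh) / 2) * fb_pitch
	                            + ((fbw - dw) / 2) * sizeof(uint16_t);
	if (xmul == 4) scale4x_n16(src, dst, sw, sh, 0, fb_pitch, ymul);
	// ... scale1x/2x/3x_n16 dispatch the same way for the other columns
}

If the centering offset breaks word alignment, the kernels themselves drop to the C path via the alignment guard.
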
" vdup.32 q8, lr ;" + " cmp %0, r8 ;" + " vstmia %1!, {q8} ;" + " vstmia r9!, {q8} ;" + " vstmia r10!, {q8} ;" + " bne 4b ;" + "5: add %0, %0, %4 ;" + " add %1, %1, %5 ;" + " cmp %0, %6 ;" + " bne 1b " + : "+r"(src), "+r"(dst) + : "r"(swl32), "r"(swl), "r"(sadd), "r"(dadd), "r"(finofs), "r"(dp) + : "r8","r9","r10","lr","q8","q9","q10","q11","q12","q13","q14","q15","memory","cc" + ); +} + +void scale4x4_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + if (!sw||!sh) return; + uint32_t swl = sw * sizeof(uint32_t); + if (!sp) { sp = swl; } if (!dp) { dp = swl*4; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale4x4_c32(src,dst,sw,sh,sp,dp); return; } uint32_t swl32 = swl & ~31; uint32_t sadd = sp - swl; uint32_t dadd = dp*4 - swl*4; @@ -563,7 +2520,14 @@ void scale4x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32 ); } -static inline void scale5x_n16line(void* src, void* dst, uint32_t swl) { +void scale4x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + const void (*func[4])(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t) + = { &scale4x1_n32, &scale4x2_n32, &scale4x3_n32, &scale4x4_n32 }; + if (--ymul < 4) func[ymul](src, dst, sw, sh, sp, dp); + return; +} + +void scale5x_n16line(void* src, void* dst, uint32_t swl) { asm volatile ( " bic r4, %2, #15 ;" // r4 = swl16 " add r3, %0, %2 ;" // r3 = lineend offset @@ -604,21 +2568,32 @@ static inline void scale5x_n16line(void* src, void* dst, uint32_t swl) { ); } -void scale5x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } +void scale5x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; uint32_t swl = sw * sizeof(uint16_t); uint32_t dwl = swl*5; if (!sp) { sp = swl; } if (!dp) { dp = dwl; } - if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale5x_c16(src,dst,sw,sh,sp,dp); return; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale5x_c16(src,dst,sw,sh,sp,dp,ymul); return; } void* __restrict dstsrc; for (; sh>0; sh--, src=(uint8_t*)src+sp) { scale5x_n16line(src, dst, swl); dstsrc = dst; dst = (uint8_t*)dst+dp; - for (uint32_t i=4; i>0; i--, dst=(uint8_t*)dst+dp) memcpy_neon(dst, dstsrc, dwl); + for (uint32_t i=ymul-1; i>0; i--, dst=(uint8_t*)dst+dp) memcpy_neon(dst, dstsrc, dwl); } } -static inline void scale5x_n32line(void* src, void* dst, uint32_t swl) { +void scale5x1_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_n16(src, dst, sw, sh, sp, dp, 1); } +void scale5x2_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_n16(src, dst, sw, sh, sp, dp, 2); } +void scale5x3_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_n16(src, dst, sw, sh, sp, dp, 3); } +void scale5x4_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_n16(src, dst, sw, sh, sp, dp, 4); } +void scale5x5_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_n16(src, dst, sw, sh, sp, dp, 5); } + +void scale5x_n32line(void* src, void* dst, uint32_t swl) { asm volatile ( " bic r4, %2, 
#15 ;" // r4 = swl16 " add r3, %0, %2 ;" // r3 = lineend offset @@ -651,21 +2626,32 @@ static inline void scale5x_n32line(void* src, void* dst, uint32_t swl) { ); } -void scale5x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } +void scale5x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; uint32_t swl = sw * sizeof(uint32_t); uint32_t dwl = swl*5; if (!sp) { sp = swl; } if (!dp) { dp = dwl; } - if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale5x_c32(src,dst,sw,sh,sp,dp); return; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale5x_c32(src,dst,sw,sh,sp,dp,ymul); return; } void* __restrict dstsrc; for (; sh>0; sh--, src=(uint8_t*)src+sp) { scale5x_n32line(src, dst, swl); dstsrc = dst; dst = (uint8_t*)dst+dp; - for (uint32_t i=4; i>0; i--, dst=(uint8_t*)dst+dp) memcpy_neon(dst, dstsrc, dwl); + for (uint32_t i=ymul-1; i>0; i--, dst=(uint8_t*)dst+dp) memcpy_neon(dst, dstsrc, dwl); } } -static inline void scale6x_n16line(void* src, void* dst, uint32_t swl) { +void scale5x1_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_n32(src, dst, sw, sh, sp, dp, 1); } +void scale5x2_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_n32(src, dst, sw, sh, sp, dp, 2); } +void scale5x3_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_n32(src, dst, sw, sh, sp, dp, 3); } +void scale5x4_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_n32(src, dst, sw, sh, sp, dp, 4); } +void scale5x5_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale5x_n32(src, dst, sw, sh, sp, dp, 5); } + +void scale6x_n16line(void* src, void* dst, uint32_t swl) { asm volatile ( " bic r4, %2, #15 ;" // r4 = swl16 " add r3, %0, %2 ;" // r3 = lineend offset @@ -704,21 +2690,34 @@ static inline void scale6x_n16line(void* src, void* dst, uint32_t swl) { ); } -void scale6x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } +void scale6x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; uint32_t swl = sw * sizeof(uint16_t); uint32_t dwl = swl*6; if (!sp) { sp = swl; } if (!dp) { dp = dwl; } - if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale6x_c16(src,dst,sw,sh,sp,dp); return; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale6x_c16(src,dst,sw,sh,sp,dp,ymul); return; } void* __restrict dstsrc; for (; sh>0; sh--, src=(uint8_t*)src+sp) { scale6x_n16line(src, dst, swl); dstsrc = dst; dst = (uint8_t*)dst+dp; - for (uint32_t i=5; i>0; i--, dst=(uint8_t*)dst+dp) memcpy_neon(dst, dstsrc, dwl); + for (uint32_t i=ymul-1; i>0; i--, dst=(uint8_t*)dst+dp) memcpy_neon(dst, dstsrc, dwl); } } -static inline void scale6x_n32line(void* src, void* dst, uint32_t swl) { +void scale6x1_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_n16(src, dst, sw, sh, sp, dp, 1); } +void scale6x2_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, 
uint32_t dp) { + scale6x_n16(src, dst, sw, sh, sp, dp, 2); } +void scale6x3_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_n16(src, dst, sw, sh, sp, dp, 3); } +void scale6x4_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_n16(src, dst, sw, sh, sp, dp, 4); } +void scale6x5_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_n16(src, dst, sw, sh, sp, dp, 5); } +void scale6x6_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_n16(src, dst, sw, sh, sp, dp, 6); } + +void scale6x_n32line(void* src, void* dst, uint32_t swl) { asm volatile ( " bic r4, %2, #15 ;" // r4 = swl16 " add r3, %0, %2 ;" // r3 = lineend offset @@ -750,292 +2749,83 @@ static inline void scale6x_n32line(void* src, void* dst, uint32_t swl) { ); } -void scale6x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } +void scale6x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul) { + if (!sw||!sh||!ymul) return; uint32_t swl = sw * sizeof(uint32_t); uint32_t dwl = swl*6; if (!sp) { sp = swl; } if (!dp) { dp = dwl; } - if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale6x_c32(src,dst,sw,sh,sp,dp); return; } + if ( ((uintptr_t)src&3)||((uintptr_t)dst&3)||(sp&3)||(dp&3) ) { scale6x_c32(src,dst,sw,sh,sp,dp,ymul); return; } void* __restrict dstsrc; for (; sh>0; sh--, src=(uint8_t*)src+sp) { scale6x_n32line(src, dst, swl); dstsrc = dst; dst = (uint8_t*)dst+dp; - for (uint32_t i=5; i>0; i--, dst=(uint8_t*)dst+dp) memcpy_neon(dst, dstsrc, dwl); + for (uint32_t i=ymul-1; i>0; i--, dst=(uint8_t*)dst+dp) memcpy_neon(dst, dstsrc, dwl); } } -// -// C scalers -// +void scale6x1_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_n32(src, dst, sw, sh, sp, dp, 1); } +void scale6x2_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_n32(src, dst, sw, sh, sp, dp, 2); } +void scale6x3_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_n32(src, dst, sw, sh, sp, dp, 3); } +void scale6x4_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_n32(src, dst, sw, sh, sp, dp, 4); } +void scale6x5_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_n32(src, dst, sw, sh, sp, dp, 5); } +void scale6x6_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + scale6x_n32(src, dst, sw, sh, sp, dp, 6); } -void scale1x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } - uint32_t swl = sw*sizeof(uint16_t); - if (!sp) { sp = swl; } if (!dp) { dp = swl*1; } - if ((swl == sp)&&(sp == dp)) memcpy(dst, src, sp*sh); - else for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp) memcpy(dst, src, swl); +static void dummy(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) {} + +void scaler_n16(uint32_t xmul, uint32_t ymul, void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, 
uint32_t dp) { + const void (*func[6][8])(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t) = { + { &scale1x1_n16, &scale1x2_n16, &scale1x3_n16, &scale1x4_n16, &dummy, &dummy, &dummy, &dummy }, + { &scale2x1_n16, &scale2x2_n16, &scale2x3_n16, &scale2x4_n16, &dummy, &dummy, &dummy, &dummy }, + { &scale3x1_n16, &scale3x2_n16, &scale3x3_n16, &scale3x4_n16, &dummy, &dummy, &dummy, &dummy }, + { &scale4x1_n16, &scale4x2_n16, &scale4x3_n16, &scale4x4_n16, &dummy, &dummy, &dummy, &dummy }, + { &scale5x1_n16, &scale5x2_n16, &scale5x3_n16, &scale5x4_n16, &scale5x5_n16, &dummy, &dummy, &dummy }, + { &scale6x1_n16, &scale6x2_n16, &scale6x3_n16, &scale6x4_n16, &scale6x5_n16, &scale6x6_n16, &dummy, &dummy } + }; + if ((--xmul < 6)&&(--ymul < 6)) func[xmul][ymul](src, dst, sw, sh, sp, dp); + return; } -void scale1x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } - uint32_t swl = sw*sizeof(uint32_t); - if (!sp) { sp = swl; } if (!dp) { dp = swl*1; } - if ((swl == sp)&&(sp == dp)) memcpy(dst, src, sp*sh); - else for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp) memcpy(dst, src, swl); +void scaler_n32(uint32_t xmul, uint32_t ymul, void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + const void (*func[6][8])(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t) = { + { &scale1x1_n32, &scale1x2_n32, &scale1x3_n32, &scale1x4_n32, &dummy, &dummy, &dummy, &dummy }, + { &scale2x1_n32, &scale2x2_n32, &scale2x3_n32, &scale2x4_n32, &dummy, &dummy, &dummy, &dummy }, + { &scale3x1_n32, &scale3x2_n32, &scale3x3_n32, &scale3x4_n32, &dummy, &dummy, &dummy, &dummy }, + { &scale4x1_n32, &scale4x2_n32, &scale4x3_n32, &scale4x4_n32, &dummy, &dummy, &dummy, &dummy }, + { &scale5x1_n32, &scale5x2_n32, &scale5x3_n32, &scale5x4_n32, &scale5x5_n32, &dummy, &dummy, &dummy }, + { &scale6x1_n32, &scale6x2_n32, &scale6x3_n32, &scale6x4_n32, &scale6x5_n32, &scale6x6_n32, &dummy, &dummy } + }; + if ((--xmul < 6)&&(--ymul < 6)) func[xmul][ymul](src, dst, sw, sh, sp, dp); + return; } -void scale2x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } - uint32_t x, dx, pix, dpix1, dpix2, swl = sw*sizeof(uint16_t); - if (!sp) { sp = swl; } swl*=2; if (!dp) { dp = swl; } - for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp*2) { - uint32_t *s = (uint32_t* __restrict)src; - uint32_t *d = (uint32_t* __restrict)dst; - for (x=dx=0; x<(sw/2); x++, dx+=2) { - pix = s[x]; - dpix1=(pix & 0x0000FFFF)|(pix<<16); - dpix2=(pix & 0xFFFF0000)|(pix>>16); - d[dx] = dpix1; d[dx+1] = dpix2; - } - if (sw&1) { - uint16_t *s16 = (uint16_t*)s; - uint16_t pix16 = s16[x*2]; - d[dx] = pix16|(pix16<<16); - } - memcpy((uint8_t*)dst+dp*1, dst, swl); - } +void scaler_c16(uint32_t xmul, uint32_t ymul, void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + const void (*func[6][8])(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t) = { + { &scale1x1_c16, &scale1x2_c16, &scale1x3_c16, &scale1x4_c16, &dummy, &dummy, &dummy, &dummy }, + { &scale2x1_c16, &scale2x2_c16, &scale2x3_c16, &scale2x4_c16, &dummy, &dummy, &dummy, &dummy }, + { &scale3x1_c16, &scale3x2_c16, &scale3x3_c16, &scale3x4_c16, &dummy, &dummy, &dummy, &dummy }, + { &scale4x1_c16, &scale4x2_c16, &scale4x3_c16, &scale4x4_c16, &dummy, &dummy, &dummy, &dummy }, + { 
&scale5x1_c16, &scale5x2_c16, &scale5x3_c16, &scale5x4_c16, &scale5x5_c16, &dummy, &dummy, &dummy }, + { &scale6x1_c16, &scale6x2_c16, &scale6x3_c16, &scale6x4_c16, &scale6x5_c16, &scale6x6_c16, &dummy, &dummy } + }; + if ((--xmul < 6)&&(--ymul < 6)) func[xmul][ymul](src, dst, sw, sh, sp, dp); + return; } -void scale2x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } - uint32_t x, dx, pix, swl = sw*sizeof(uint32_t); - if (!sp) { sp = swl; } swl*=2; if (!dp) { dp = swl; } - for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp*2) { - uint32_t *s = (uint32_t* __restrict)src; - uint32_t *d = (uint32_t* __restrict)dst; - for (x=dx=0; x<sw; x++, dx+=2) { - pix = s[x]; - d[dx] = pix; d[dx+1] = pix; - } - memcpy((uint8_t*)dst+dp*1, dst, swl); - } +void scaler_c32(uint32_t xmul, uint32_t ymul, void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { + const void (*func[6][8])(void* __restrict, void* __restrict, uint32_t, uint32_t, uint32_t, uint32_t) = { + { &scale1x1_c32, &scale1x2_c32, &scale1x3_c32, &scale1x4_c32, &dummy, &dummy, &dummy, &dummy }, + { &scale2x1_c32, &scale2x2_c32, &scale2x3_c32, &scale2x4_c32, &dummy, &dummy, &dummy, &dummy }, + { &scale3x1_c32, &scale3x2_c32, &scale3x3_c32, &scale3x4_c32, &dummy, &dummy, &dummy, &dummy }, + { &scale4x1_c32, &scale4x2_c32, &scale4x3_c32, &scale4x4_c32, &dummy, &dummy, &dummy, &dummy }, + { &scale5x1_c32, &scale5x2_c32, &scale5x3_c32, &scale5x4_c32, &scale5x5_c32, &dummy, &dummy, &dummy }, + { &scale6x1_c32, &scale6x2_c32, &scale6x3_c32, &scale6x4_c32, &scale6x5_c32, &scale6x6_c32, &dummy, &dummy } + }; + if ((--xmul < 6)&&(--ymul < 6)) func[xmul][ymul](src, dst, sw, sh, sp, dp); + return; } -void scale3x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } - uint32_t x, dx, pix, dpix1, dpix2, swl = sw*sizeof(uint16_t); - if (!sp) { sp = swl; } swl*=3; if (!dp) { dp = swl; } - for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp*3) { - uint32_t *s = (uint32_t* __restrict)src; - uint32_t *d = (uint32_t* __restrict)dst; - for (x=dx=0; x<(sw/2); x++, dx+=3) { - pix = s[x]; - dpix1=(pix & 0x0000FFFF)|(pix<<16); - dpix2=(pix & 0xFFFF0000)|(pix>>16); - d[dx] = dpix1; d[dx+1] = pix; d[dx+2] = dpix2; - } - if (sw&1) { - uint16_t *s16 = (uint16_t*)s; - uint16_t *d16 = (uint16_t*)d; - uint16_t pix16 = s16[x*2]; - dpix1 = pix16|(pix16<<16); - d[dx] = dpix1; d16[(dx+1)*2] = pix16; - } - memcpy((uint8_t*)dst+dp*1, dst, swl); - memcpy((uint8_t*)dst+dp*2, dst, swl); - } -} - -void scale3x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } - uint32_t x, dx, pix, swl = sw*sizeof(uint32_t); - if (!sp) { sp = swl; } swl*=3; if (!dp) { dp = swl; } - for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp*3) { - uint32_t *s = (uint32_t* __restrict)src; - uint32_t *d = (uint32_t* __restrict)dst; - for (x=dx=0; x<sw; x++, dx+=3) { - pix = s[x]; - d[dx] = pix; d[dx+1] = pix; d[dx+2] = pix; - } - memcpy((uint8_t*)dst+dp*1, dst, swl); - memcpy((uint8_t*)dst+dp*2, dst, swl); - } -} - -void scale4x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } - uint32_t x, dx, pix, dpix1, dpix2, swl = sw*sizeof(uint16_t); - if (!sp) { sp = swl; } swl*=4; if (!dp) { dp = swl; } - for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp*4) { - uint32_t *s = (uint32_t* __restrict)src; - uint32_t *d = (uint32_t* __restrict)dst; - for (x=dx=0; x<(sw/2); x++, dx+=4) { - pix = s[x]; - dpix1=(pix & 0x0000FFFF)|(pix<<16); - dpix2=(pix & 0xFFFF0000)|(pix>>16); - d[dx] = dpix1; d[dx+1] = dpix1; d[dx+2] = dpix2; d[dx+3] = dpix2; - } - if (sw&1) { - uint16_t *s16 = (uint16_t*)s; - uint16_t pix16 = s16[x*2]; - dpix1 = pix16|(pix16<<16); - d[dx] = dpix1; d[dx+1] = dpix1; - } - memcpy((uint8_t*)dst+dp*1, dst, swl); - memcpy((uint8_t*)dst+dp*2, dst, swl); - memcpy((uint8_t*)dst+dp*3, dst, swl); - } -} - -// faster than 4x_c16 when -Ofast/-O3 and aligned width, however dp must be 4xN -void scale4x_c16b(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } if (!sp) { sp = sw*sizeof(uint16_t); } if (!dp) { dp = sw*sizeof(uint16_t)*4; } - uint32_t x, dx, pix, dpix1, dpix2, dp32 = dp / sizeof(uint32_t); - for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp*4) { - uint32_t *s = (uint32_t* __restrict)src; - uint32_t *d = (uint32_t* __restrict)dst; - for (x=dx=0; x<(sw/2); x++, dx+=4) { - pix = s[x]; - dpix1=(pix & 0x0000FFFF)|(pix<<16); - dpix2=(pix & 0xFFFF0000)|(pix>>16); - d[dx] = dpix1; d[dx+1] = dpix1; d[dx+2] = dpix2; d[dx+3] = dpix2; - d[dp32+dx] = dpix1; d[dp32+dx+1]= dpix1; d[dp32+dx+2]= dpix2; d[dp32+dx+3]= dpix2; - d[dp32*2+dx] = dpix1; d[dp32*2+dx+1]= dpix1; d[dp32*2+dx+2]= dpix2; d[dp32*2+dx+3]= dpix2; - d[dp32*3+dx] = dpix1; d[dp32*3+dx+1]= dpix1; d[dp32*3+dx+2]= dpix2; d[dp32*3+dx+3]= dpix2; - } - if (sw&1) { - uint16_t *s16 = (uint16_t*)s; - uint16_t pix16 = s16[x*2]; - dpix1 = pix16|(pix16<<16); - d[dx] = dpix1; d[dx+1] = dpix1; - d[dp32+dx] = dpix1; d[dp32+dx+1] = dpix1; - d[dp32*2+dx]
= dpix1; d[dp32*2+dx+1] = dpix1; - d[dp32*3+dx] = dpix1; d[dp32*3+dx+1] = dpix1; - } - } -} - -void scale4x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } - uint32_t x, dx, pix, swl = sw*sizeof(uint32_t); - if (!sp) { sp = swl; } swl*=4; if (!dp) { dp = swl; } - for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp*4) { - uint32_t *s = (uint32_t* __restrict)src; - uint32_t *d = (uint32_t* __restrict)dst; - for (x=dx=0; x<sw; x++, dx+=4) { - pix = s[x]; - d[dx] = pix; d[dx+1] = pix; d[dx+2] = pix; d[dx+3] = pix; - } - memcpy((uint8_t*)dst+dp*1, dst, swl); - memcpy((uint8_t*)dst+dp*2, dst, swl); - memcpy((uint8_t*)dst+dp*3, dst, swl); - } -} - -// faster than 4x_c32 when -Ofast/-O3 and aligned width, however dp must be 4xN -void scale4x_c32b(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } if (!sp) { sp = sw*sizeof(uint32_t); } if (!dp) { dp = sw*sizeof(uint32_t)*4; } - uint32_t x, dx, pix, dp32 = dp / sizeof(uint32_t); - for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp*4) { - uint32_t *s = (uint32_t* __restrict)src; - uint32_t *d = (uint32_t* __restrict)dst; - for (x=dx=0; x<sw; x++, dx+=4) { - pix = s[x]; - d[dx] = pix; d[dx+1] = pix; d[dx+2] = pix; d[dx+3] = pix; - d[dp32+dx] = pix; d[dp32+dx+1] = pix; d[dp32+dx+2] = pix; d[dp32+dx+3] = pix; - d[dp32*2+dx] = pix; d[dp32*2+dx+1] = pix; d[dp32*2+dx+2] = pix; d[dp32*2+dx+3] = pix; - d[dp32*3+dx] = pix; d[dp32*3+dx+1] = pix; d[dp32*3+dx+2] = pix; d[dp32*3+dx+3] = pix; - } - } -} - -void scale5x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } - uint32_t x, dx, pix, dpix1, dpix2, swl = sw*sizeof(uint16_t); - if (!sp) { sp = swl; } swl*=5; if (!dp) { dp = swl; } - for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp*5) { - uint32_t *s = (uint32_t* __restrict)src; - uint32_t *d = (uint32_t* __restrict)dst; - for (x=dx=0; x<(sw/2); x++, dx+=5) { - pix = s[x]; - dpix1=(pix & 0x0000FFFF)|(pix<<16); - dpix2=(pix & 0xFFFF0000)|(pix>>16); - d[dx] = dpix1; d[dx+1] = dpix1; d[dx+2] = pix; d[dx+3] = dpix2; d[dx+4] = dpix2; - } - if (sw&1) { - uint16_t *s16 = (uint16_t*)s; - uint16_t *d16 = (uint16_t*)d; - uint16_t pix16 = s16[x*2]; - dpix1 = pix16|(pix16<<16); - d[dx] = dpix1; d[dx+1] = dpix1; d16[(dx+2)*2] = pix16; - } - memcpy((uint8_t*)dst+dp*1, dst, swl); - memcpy((uint8_t*)dst+dp*2, dst, swl); - memcpy((uint8_t*)dst+dp*3, dst, swl); - memcpy((uint8_t*)dst+dp*4, dst, swl); - } -} - -void scale5x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } - uint32_t x, dx, pix, swl = sw*sizeof(uint32_t); - if (!sp) { sp = swl; } swl*=5; if (!dp) { dp = swl; } - for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp*5) { - uint32_t *s = (uint32_t* __restrict)src; - uint32_t *d = (uint32_t* __restrict)dst; - for (x=dx=0; x<sw; x++, dx+=5) { - pix = s[x]; - d[dx] = pix; d[dx+1] = pix; d[dx+2] = pix; d[dx+3] = pix; d[dx+4] = pix; - } - memcpy((uint8_t*)dst+dp*1, dst, swl); - memcpy((uint8_t*)dst+dp*2, dst, swl); - memcpy((uint8_t*)dst+dp*3, dst, swl); - memcpy((uint8_t*)dst+dp*4, dst, swl); - } -} - -void scale6x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } - uint32_t x, dx, pix, dpix1, dpix2, swl = sw*sizeof(uint16_t); - if (!sp) { sp = swl; } swl*=6; if (!dp) { dp = swl; } - for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp*6) { - uint32_t *s = (uint32_t* __restrict)src; - uint32_t *d = (uint32_t* __restrict)dst; - for (x=dx=0; x<(sw/2); x++, dx+=6) { - pix = s[x]; - dpix1=(pix & 0x0000FFFF)|(pix<<16); - dpix2=(pix & 0xFFFF0000)|(pix>>16); - d[dx] = dpix1; d[dx+1] = dpix1; d[dx+2] = dpix1; d[dx+3] = dpix2; d[dx+4] = dpix2; d[dx+5] = dpix2; - } - if (sw&1) { - uint16_t *s16 = (uint16_t*)s; - uint16_t pix16 = s16[x*2]; - dpix1 = pix16|(pix16<<16); - d[dx] = dpix1; d[dx+1] = dpix1; d[dx+2] = dpix1; - } - memcpy((uint8_t*)dst+dp*1, dst, swl); - memcpy((uint8_t*)dst+dp*2, dst, swl); - memcpy((uint8_t*)dst+dp*3, dst, swl); - memcpy((uint8_t*)dst+dp*4, dst, swl); - memcpy((uint8_t*)dst+dp*5, dst, swl); - } -} - -void scale6x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp) { - if (!sw||!sh) { return; } - uint32_t x, dx, pix, swl = sw*sizeof(uint32_t); - if (!sp) { sp = swl; } swl*=6; if (!dp) { dp = swl; } - for (; sh>0; sh--, src=(uint8_t*)src+sp, dst=(uint8_t*)dst+dp*6) { - uint32_t *s = (uint32_t* __restrict)src; - uint32_t *d = (uint32_t* __restrict)dst; - for (x=dx=0; x<sw; x++, dx+=6) { - pix = s[x]; - d[dx] = pix; d[dx+1] = pix; d[dx+2] = pix; d[dx+3] = pix; d[dx+4] = pix; d[dx+5] = pix; - } - memcpy((uint8_t*)dst+dp*1, dst, swl); - memcpy((uint8_t*)dst+dp*2, dst, swl); - memcpy((uint8_t*)dst+dp*3, dst, swl); - memcpy((uint8_t*)dst+dp*4, dst, swl); - memcpy((uint8_t*)dst+dp*5, dst, swl); - } -} diff --git a/src/common/scaler_neon.h b/src/common/scaler_neon.h --- a/src/common/scaler_neon.h +++ b/src/common/scaler_neon.h @@ -1,10 +1,10 @@ #ifndef __SCALER_NEON_H__ #define __SCALER_NEON_H__ #include <stdint.h> // -// arm NEON / C integer scalers for miyoomini +// arm NEON / C integer scalers for rg35xx // args/ src : src offset address of top left corner // dst : dst offset address of top left corner // sw : src width pixels // sh : src height pixels // sp : src pitch (bytes) @@ -19,36 +19,155 @@ typedef void (*scale_neon_t)(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +// Functions for generic call +// n/c = neon or c +// 16/32 = bpp +// xmul = 1,2,3,4,5,6 +// ymul = 1,2,3,4(xmul < 5) / 1,2,3,4,5(xmul == 5) / 1,2,3,4,5,6(xmul == 6) +void scaler_n16(uint32_t xmul, 
uint32_t ymul, void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scaler_n32(uint32_t xmul, uint32_t ymul, void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scaler_c16(uint32_t xmul, uint32_t ymul, void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scaler_c32(uint32_t xmul, uint32_t ymul, void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); + +// NEON memcpy +void memcpy_neon(void* dst, void* src, uint32_t size); // NEON scalers -void scale1x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale1x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale2x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale2x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale3x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale3x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale4x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale4x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale5x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale5x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale6x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale6x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale1x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale2x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale2x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale3x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale3x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale4x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale4x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale5x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale5x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale6x_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale6x_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t 
dp, uint32_t ymul); + +void scale1x1_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x1_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x2_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x2_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x3_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x3_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x4_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x4_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x1_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x1_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x2_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x2_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x3_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x3_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x4_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x4_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x1_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x1_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x2_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x2_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x3_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x3_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x4_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x4_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x1_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x1_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x2_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x2_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x3_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x3_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x4_n16(void* __restrict src, void* 
__restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x4_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x1_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x1_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x2_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x2_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x3_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x3_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x4_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x4_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x5_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x5_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x1_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x1_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x2_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x2_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x3_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x3_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x4_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x4_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x5_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x5_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x6_n16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x6_n32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); // C scalers -void scale1x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale1x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale2x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale2x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale3x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale3x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale4x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); 
-void scale4x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -// c16b/c32b: faster when -Ofast/-O3 and aligned width, however dp must be 4xN -void scale4x_c16b(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale4x_c32b(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale5x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale5x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale6x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); -void scale6x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale1x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale2x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale2x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale3x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale3x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale4x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale4x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale5x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale5x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale6x_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); +void scale6x_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp, uint32_t ymul); + +void scale1x1_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x1_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x2_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x2_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x3_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x3_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x4_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale1x4_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x1_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x1_c32(void* __restrict src, void* __restrict 
dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x2_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x2_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x3_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x3_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x4_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale2x4_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x1_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x1_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x2_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x2_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x3_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x3_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x4_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale3x4_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x1_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x1_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x2_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x2_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x3_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x3_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x4_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale4x4_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x1_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x1_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x2_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x2_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x3_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x3_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x4_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x4_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void 
scale5x5_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale5x5_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x1_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x1_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x2_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x2_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x3_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x3_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x4_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x4_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x5_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x5_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x6_c16(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); +void scale6x6_c32(void* __restrict src, void* __restrict dst, uint32_t sw, uint32_t sh, uint32_t sp, uint32_t dp); #endif diff --git a/src/minarch/minarch.c b/src/minarch/minarch.c index 9906320..2a60cb0 100644 --- a/src/minarch/minarch.c +++ b/src/minarch/minarch.c @@ -2568,12 +2568,12 @@ static void selectScaler_PAR(int width, int height, int pitch) { } else { switch (scale) { - case 6: renderer.scaler = scale6x_n16; break; - case 5: renderer.scaler = scale5x_n16; break; - case 4: renderer.scaler = scale4x_n16; break; - case 3: renderer.scaler = scale3x_n16; break; - case 2: renderer.scaler = scale2x_n16; break; - default: renderer.scaler = scale1x_n16; break; + case 6: renderer.scaler = scale6x6_n16; break; + case 5: renderer.scaler = scale5x5_n16; break; + case 4: renderer.scaler = scale4x4_n16; break; + case 3: renderer.scaler = scale3x3_n16; break; + case 2: renderer.scaler = scale2x2_n16; break; + default: renderer.scaler = scale1x1_n16; break; // my lesser scalers :sweat_smile: // case 4: renderer.scaler = scale4x; break; @@ -2710,12 +2710,12 @@ static void selectScaler_AR(int width, int height, int pitch) { if (has_hdmi) LOG_warn("dst offset: %i,%i (%i)\n", dx,dy, renderer.dst_offset); switch (scale) { - case 6: renderer.scaler = scale6x_n16; break; - case 5: renderer.scaler = scale5x_n16; break; - case 4: renderer.scaler = scale4x_n16; break; - case 3: renderer.scaler = scale3x_n16; break; - case 2: renderer.scaler = scale2x_n16; break; - default: renderer.scaler = scale1x_n16; break; + case 6: renderer.scaler = scale6x6_n16; break; + case 5: renderer.scaler = scale5x5_n16; break; + case 4: renderer.scaler = scale4x4_n16; break; + case 3: renderer.scaler = scale3x3_n16; break; + case 2: renderer.scaler = scale2x2_n16; break; + default: renderer.scaler = scale1x1_n16; break; } // DEBUG HUD diff --git a/todo.txt b/todo.txt index 136b394..f76200d 100644 --- a/todo.txt +++ b/todo.txt @@ -14,6 +14,10 @@ Please see the README.txt in the zip file for installation and update instructio BUG: minui.elf treats bare 
tag rom folders as empty + +rip out hdmi code +insert new de+ion code +test test test hardware rev volume and menu buttons no longer work
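
Usage note: the generic entry points added to scaler_neon.h above (scaler_n16/scaler_n32 and scaler_c16/scaler_c32) take the x/y integer multipliers as runtime arguments and dispatch to the matching fixed-ratio scaler; out-of-range multipliers land on a no-op. A minimal sketch of a caller, assuming a hypothetical 320x240 RGB565 source and a 1280x720 RGB565 framebuffer (blit_3x3, frame, and fb are illustrative names, not part of this patch):

#include <stdint.h>
#include "scaler_neon.h"

/* Hypothetical caller: scale a 320x240 RGB565 frame up 3x3 to 960x720,
   centered horizontally in a 1280x720 RGB565 framebuffer. */
void blit_3x3(void* frame, void* fb) {
	uint32_t sw = 320, sh = 240;
	uint32_t sp = sw * sizeof(uint16_t);             /* src pitch in bytes (0 also means "packed") */
	uint32_t dp = 1280 * sizeof(uint16_t);           /* dst pitch in bytes */
	uint16_t* dst = (uint16_t*)fb + (1280 - sw*3)/2; /* left edge of the centered region */
	scaler_n16(3, 3, frame, dst, sw, sh, sp, dp);    /* resolves to scale3x3_n16() */
}

The NEON variants already fall back to their C counterparts when src, dst, sp, or dp are not 4-byte aligned, so the caller only has to pick valid multipliers.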