diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/Makefile.am dosbox-0.61+hq2x/src/gui/Makefile.am --- dosbox-0.61/src/gui/Makefile.am 2004-07-05 02:44:22.000000000 +0200 +++ dosbox-0.61+hq2x/src/gui/Makefile.am 2004-07-04 23:25:07.000000000 +0200 @@ -3,5 +3,6 @@ noinst_LIBRARIES = libgui.a libgui_a_SOURCES = sdlmain.cpp sdl_mapper.cpp \ render.cpp render_scalers.cpp render_scalers.h render_templates.h \ - midi.cpp midi_win32.h midi_oss.h midi_coreaudio.h midi_alsa.h + midi.cpp midi_win32.h midi_oss.h midi_coreaudio.h midi_alsa.h \ + render_hq2x.cpp render_hq2x.h diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render.cpp dosbox-0.61+hq2x/src/gui/render.cpp --- dosbox-0.61/src/gui/render.cpp 2004-08-05 00:12:58.732847304 +0200 +++ dosbox-0.61+hq2x/src/gui/render.cpp 2004-08-04 23:50:12.000000000 +0200 @@ -33,6 +33,7 @@ #include "support.h" #include "render_scalers.h" +#include "render_hq2x.h" struct PalData { struct { @@ -190,6 +191,9 @@ } break; } + if (render.op.type == OP_Hq2x) { + Hq2x_InitLUTs((void*)render.pal.rgb,render.pal.last,render.pal.first); + } /* Setup pal index to startup values */ render.pal.first=256; render.pal.last=0; @@ -314,6 +318,7 @@ case OP_Interp2x:block=&Interp2x_8;break; case OP_AdvInterp2x:block=&AdvInterp2x_8;break; case OP_TV2x:block=&TV2x_8;break; + case OP_Hq2x:block=&Hq2x_8;break; } gfx_flags=GFX_GetBestMode(block->flags); if (!gfx_flags) { @@ -362,7 +367,7 @@ extern void GFX_SetTitle(Bits cycles, Bits frameskip,bool paused); static void IncreaseFrameSkip(void) { - if (render.frameskip.max<10) render.frameskip.max++; + if (render.frameskip.max<25) render.frameskip.max++; LOG_MSG("Frame Skip at 
%d",render.frameskip.max); GFX_SetTitle(-1,render.frameskip.max,false); } @@ -376,6 +381,12 @@ void RENDER_Init(Section * sec) { Section_prop * section=static_cast<Section_prop *>(sec); + Hq2x_colourTrigger=section->Get_int("hq2x_threshold"); + if (Hq2x_colourTrigger > 255) Hq2x_colourTrigger = 255; + if (Hq2x_colourTrigger < 0) Hq2x_colourTrigger = 0; + Hq2x_colourTrigger_adaptive=section->Get_int("hq2x_threshold_adaptive"); + if (Hq2x_colourTrigger_adaptive > 100) Hq2x_colourTrigger_adaptive = 100; /* 0..100 weight in Diff(); same cap as Hq2x_IncreaseThresholdAdaptive */ + if (Hq2x_colourTrigger_adaptive <= 0) Hq2x_colourTrigger_adaptive = 75; /* unset or invalid: fall back to the built-in default */ render.pal.first=256; render.pal.last=0; render.aspect=section->Get_bool("aspect"); @@ -398,12 +409,17 @@ else if (!strcasecmp(scaler,"advinterp2x")) render.op.want_type=OP_AdvInterp2x; else if (!strcasecmp(scaler,"interp2x")) render.op.want_type=OP_Interp2x; else if (!strcasecmp(scaler,"tv2x")) render.op.want_type=OP_TV2x; + else if (!strcasecmp(scaler,"hq2x")) render.op.want_type=OP_Hq2x; else { render.op.want_type=OP_Normal; LOG_MSG("Illegal scaler type %s,falling back to normal.",scaler); } MAPPER_AddHandler(DecreaseFrameSkip,MK_f7,MMOD1,"decfskip","Dec Fskip"); MAPPER_AddHandler(IncreaseFrameSkip,MK_f8,MMOD1,"incfskip","Inc Fskip"); + MAPPER_AddHandler(Hq2x_DecreaseThreshold,MK_f3,MMOD1|MMOD2,"dechq2xthreshold","Dec Hq2x Static Threshold"); + MAPPER_AddHandler(Hq2x_IncreaseThreshold,MK_f4,MMOD1|MMOD2,"inchq2xthreshold","Inc Hq2x Static Threshold"); + MAPPER_AddHandler(Hq2x_DecreaseThresholdAdaptive,MK_f5,MMOD1|MMOD2,"dechq2xadapthreshold","Dec Hq2x Adaptive Threshold"); + MAPPER_AddHandler(Hq2x_IncreaseThresholdAdaptive,MK_f6,MMOD1|MMOD2,"inchq2xadapthreshold","Inc Hq2x Adaptive Threshold"); GFX_SetTitle(-1,render.frameskip.max,false); } diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render_hq2x.cpp 
dosbox-0.61+hq2x/src/gui/render_hq2x.cpp
--- dosbox-0.61/src/gui/render_hq2x.cpp 1970-01-01 01:00:00.000000000 +0100
+++ dosbox-0.61+hq2x/src/gui/render_hq2x.cpp 2004-08-04 23:43:53.000000000 +0200
@@ -0,0 +1,799 @@
+//hq2x filter demo program
+//----------------------------------------------------------
+//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
+// Speed optimization and mmx code Copyright (c) 2004 Jörg Walter (jwalt@garni.ch)
+
+//This program is free software; you can redistribute it and/or
+//modify it under the terms of the GNU Lesser General Public
+//License as published by the Free Software Foundation; either
+//version 2.1 of the License, or (at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+//Lesser General Public License for more details.
+//
+//You should have received a copy of the GNU Lesser General Public
+//License along with this program; if not, write to the Free Software
+//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+/*
+  This code comes in three variants:
+  1. plain C code with live difference calculation
+  2. C code with live difference calculation in MMX
+  3. lookup-table based difference calculation
+
+  Which one is fastest depends on your CPU speed and cache size. The table based algorithm
+  should be fastest if you have 32kb L1 data cache or more. Packing diff values into fewer
+  bytes is possible, define DIFF_TABLE to the number of bits per int.
+
+  Speed: 22fps/27fps(MMX)/32fps(table) on a pentium2/333MHz
+
+  TODO:
+  Currently only does 32bpp/16bpp BGRA output, and (theoretically) RGB output. YUV
+  isn't needed anymore, it seems, so this code should now work in all setups.
+  MMX code only does RGB, thus isn't really usable (but useful for benchmarking).
+  This code should use the intel compiler functions for mmx, as GCC emulates
+  them more or less completely.
+
+  further optimization ideas:
+  - fix gcc bugs (shift), so Diff_mmx can run without register spilling
+  - manual unrolling of Diff loop to get decent memory prefetch for
+    recent CPUs
+  - add mmxext support to Diff for faster unpacking
+  - test if sse's movntq in interpolation loop improves things
+  - find a way to mmxify the interpolation loop sensibly
+    (currently runs slower than non-mmx code)
+  - find a way to save (cache-)memory in the factors table
+    (tighter packing and double indirection are both slower on p2)
+  - find a way for 16bpp not to suck that hard (speed-wise)
+*/
+
+
+#include /* NOTE(review): the names of these five system headers are missing here (angle-bracket text apparently stripped); restore them from the upstream patch */
+#include
+#include
+#include
+#include
+
+#include "config.h"
+#include "dosbox.h"
+#include "video.h"
+#include "render_scalers.h"
+#include "render_hq2x.h"
+
+#ifndef __GNUC__ /* GCC-compatible compilers predefine __GNUC__; stub out the GNU extensions only for other compilers */
+#define __attribute__(x)
+#define __builtin_expect(x,y) (x)
+#endif
+
+#define ADAPTIVE
+/* #define DEBUG */
+// #define DIFF_TABLE 32
+#define DIFF_TABLE 1
+
+// Gathered experimentally, values from 0x08-0x80 are useful, depending on graphics
+// and your personal preference.
+long Hq2x_colourTrigger = 0;
+long Hq2x_colourTrigger_adaptive = 75;
+
+#ifdef ADAPTIVE
+#undef DIFF_TABLE
+#define DIFF_TABLE 1
+#endif
+
+#ifdef DIFF_TABLE
+#define bits DIFF_TABLE
+static
+#if DIFF_TABLE == 1
+unsigned char
+#else
+int
+#endif
+difftable[65536/bits];
+/* Red-mean weighted colour distance of two RGB triples; ADAPTIVE builds return the excess over Hq2x_colourTrigger clamped to 0..255, other builds a 0/1 flag. */
+inline static unsigned int Diff1_calc(int r1, int g1, int b1, int r2, int g2, int b2)
+{
+  /* channel deltas, plus the red sum that shifts weight between the red and blue terms */
+  const long rsum = r1+r2;
+  const long dr = r1-r2;
+  const long dg = g1-g2;
+  const long db = b1-b2;
+  const long weighted = ((128+rsum)*dr*dr + (192-rsum)*db*db)/256 + dg*dg;
+
+#ifdef ADAPTIVE
+  /* subtract the static threshold and saturate the remainder */
+  const long dist = (long)(unsigned int)weighted;
+  if (dist < Hq2x_colourTrigger) return 0;
+  return dist-Hq2x_colourTrigger > 255 ? 255 : dist-Hq2x_colourTrigger;
+#else
+  return weighted > Hq2x_colourTrigger;
+#endif
+}
+/* Diff1(x,y): precomputed distance for the byte pair (*x,*y), read from difftable. */
+#if DIFF_TABLE == 1
+#define Diff1(x,y) (difftable[(*(x)) * (256/bits) + ((*(y)) / bits)] >> ((*(y))%bits))
+#else
+#define Diff1(x,y) ((difftable[(*(x)) * (256/bits) + ((*(y)) / bits)] >> ((*(y))%bits)) & 1)
+#endif
+inline static int Diff(const unsigned char *l2, const unsigned char *l3)
+{
+#ifdef ADAPTIVE
+  /* look each of the four pair distances up once, then compare them against a blend of their extremes */
+  const unsigned int dA = Diff1(l2,l3+1), dB = Diff1(l2+1,l3);
+  const unsigned int dC = Diff1(l2+1,l3+1), dD = Diff1(l3,l3+1);
+  unsigned int lo = dA, hi = dA, dynthres;
+  int pattern = 0;
+  if (dB > hi) hi = dB; else if (dB < lo) lo = dB;
+  if (dC > hi) hi = dC; else if (dC < lo) lo = dC;
+  if (dD > hi) hi = dD; else if (dD < lo) lo = dD;
+  dynthres = (Hq2x_colourTrigger_adaptive*hi+(100-Hq2x_colourTrigger_adaptive)*lo)/200;
+  if (dA > dynthres) pattern |= 0x00aa0055;
+  if (dB > dynthres) pattern |= 0x005500aa;
+  if (dC > dynthres) pattern |= 0x03000300;
+  if (dD > dynthres) pattern |= 0x0c000c00;
+  return pattern;
+#else
+  return (Diff1(l2,l3+1)*0x00aa0055) | (Diff1(l2+1,l3)*0x005500aa) | (Diff1(l2+1,l3+1)*0x03000300) | (Diff1(l3,l3+1)*0x0c000c00);
+#endif
+}
+#undef __MMX__
+#else
+#ifdef __MMX__
+/* always on for gcc for now */
+#define MMX_ONLY
+/* this is safe for -march=..., but not if
someone specifies -mmmx manually */
+
+# ifdef MMX_ONLY
+# define Diff_mmx Diff
+# else
+  int has_mmx = 0;
+# define Diff(a,b) (has_mmx?Diff_mmx((a),(b)):Diff_any((a),(b)))
+# endif
+#else
+# define Diff_any Diff
+#endif
+
+// A better colour distance function, adapted from http://www.compuphase.com/cmetric.htm
+#if !defined(MMX_ONLY) || defined(DEBUG)
+
+inline static int Diff1(const unsigned char *e1, const unsigned char* e2)
+{
+  long r,g,b;
+  long rmean;
+
+  rmean = e1[2]+e2[2]; /* red sum: byte 2 is the red channel (see r below), matching Diff1_calc and the MMX path */
+  b = e1[0]-e2[0];
+  g = e1[1]-e2[1];
+  r = e1[2]-e2[2];
+
+  return ((128+rmean)*r*r + (192-rmean)*b*b)/256 + g*g > Hq2x_colourTrigger;
+}
+
+inline static int Diff_any(const unsigned long *l2, const unsigned long *l3)
+{
+  return (Diff1((unsigned char *)l2,(unsigned char *)(l3+1))*0x00aa0055) | (Diff1((unsigned char *)(l2+1),(unsigned char *)l3)*0x005500aa) | (Diff1((unsigned char *)(l2+1),(unsigned char *)(l3+1))*0x03000300) | (Diff1((unsigned char *)l3,(unsigned char *)(l3+1))*0x0c000c00);
+}
+#endif
+
+#ifdef __MMX__
+typedef int mmx_1_64 __attribute__((mode(DI)));
+typedef int mmx_2_32 __attribute__((mode(V2SI)));
+typedef int mmx_4_16 __attribute__((mode(V4HI)));
+typedef int mmx_8_8 __attribute__((mode(V8QI)));
+
+static mmx_4_16 mmx_trigger;
+
+/* Note: this needs BGRA pixel layout, with the A component replaced by (-R)+32 */
+inline static int Diff_mmx(const unsigned long *e1, const unsigned long *e2)
+{
+  mmx_4_16 mm0, mm1, mm2, mm3, mm4;
+#ifdef DEBUG
+  mmx_4_16 t0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16,t17,t18, m0, m1;
+  #define d(x) x =
+#else
+#define d(x)
+#endif
+  const mmx_8_8 zero = (mmx_8_8)0x0ULL;
+  const mmx_4_16 rmean_off = (mmx_4_16)(0x0a000a000a000a00ULL);
+  const mmx_4_16 factors = (mmx_4_16)0xfffdfff4aa5655abULL;
+
+  /* -1 * aa56 = 0101 0101 1010 1010 */
+  /* -1 * 55ab = 1010 1010 0101 0101 */
+
+  /*
+    Read from memory:
+  */
+  mm1 = *(mmx_4_16 *)e2;
+  mm0 = *(mmx_4_16 *)e1;
+
+  /* (high ................................ low)
+    -p2r+32, p2b, p2g, p2r, -p1r+32, p1b, p1g, p1r = mm0
+    -p5r+32, p5b, p5g, p5r, -p4r+32, p4b, p4g, p4r = mm1
+
+    Shuffle dwords so we get 4 registers with pixel
+    arrangement ready for difference calculation:
+
+    (2, 5, 4, 1) - (5, 4, 2, 5)
+
+    We choose (rrrr, gggg) + (bbbb, rrrr) layout. This
+    is quite expensive, given that difference calculation
+    in (rgbr, rgbr) form would need just two unpacks, but
+    the unpacking has to be done sooner or later, and
+    this pixel layout makes later calculations cheaper.
+    TODO: sse/mmxext version of this unpacking should be
+    much cheaper.
+
+    -p4r+32, -p1r+32, p4b, p1b, p4g, p1g, p4r, p1r = mm0
+    -p5r+32, p5b, p5g, p5r, -p5r+32, p5b, p5g, p5r = mm3 (temp)
+    -p2r+32, -p5r+32, p2b, p5b, p2g, p5g, p2r, p5r = mm2
+    -p5r+32, -p4r+32, p5b, p4b, p5g, p4g, p5r, p4r = mm1
+*/
+
+  mm3 = (mmx_4_16)__builtin_ia32_punpckhdq((mmx_2_32)mm1,(mmx_2_32)mm1);
+  mm2 = (mmx_4_16)__builtin_ia32_punpckhbw((mmx_8_8)mm1,(mmx_8_8)mm0);
+  mm0 = (mmx_4_16)__builtin_ia32_punpcklbw((mmx_8_8)mm0,(mmx_8_8)mm1);
+  mm1 = (mmx_4_16)__builtin_ia32_punpcklbw((mmx_8_8)mm1,(mmx_8_8)mm3);
+
+/*
+    ... continued ...
+
+    -p2r+32, -p5r+32, -p4r+32, -p1r+32, p2b, p5b, p4b, p1b = mm0
+    p2g, p5g, p4g, p1g, p2r, p5r, p4r, p1r = mm3
+
+    -p5r+32, -p4r+32, -p2r+32, -p5r+32, p5b, p4b, p2b, p5b = mm4
+    p5g, p4g, p2g, p5g, p5r, p4r, p2r, p5r = mm2
+
+*/
+
+  mm3 = (mmx_4_16)__builtin_ia32_punpcklwd(mm0,mm2);
+  mm0 = (mmx_4_16)__builtin_ia32_punpckhwd(mm0,mm2);
+  mm4 = (mmx_4_16)__builtin_ia32_punpckhwd(mm2,mm1);
+  mm2 = (mmx_4_16)__builtin_ia32_punpcklwd(mm2,mm1);
+
+  /*
+    Put mm2 with negated red component into mm1. Negation is done
+    in the lookup table.
+
+    -p2r+32, -p5r+32, -p4r+32, -p1r+32, p2b, p5b, p4b, p1b = mm0
+    p2g, p5g, p4g, p1g, p2r, p5r, p4r, p1r = mm3
+
+    -p5r+32, -p4r+32, -p2r+32, -p5r+32, p5b, p4b, p2b, p5b = mm4
+    p5g, p4g, p2g, p5g, -p5r+32, -p4r+32, -p2r+32, -p5r+32 = mm2
+
+  */
+
+  mm2 = (mmx_4_16)__builtin_ia32_punpckhdq((mmx_2_32)mm4,(mmx_2_32)mm2);
+
+  /*
+
+    Calculate the differences (and rmean)
+    mm0-mm4, mm3-mm2 (signed saturation)
+
+    d1r/8, d3r/8, d8r/8, d0r/8, d1b/8, d3b/8, d8b/8, d0b/8 = mm0
+    d1g/8, d3g/8, d8g/8, d0g/8, d1rmean/4-32, d3rmean/4-32, d8rmean/4-32, d0rmean/4-32 = mm3
+
+  */
+
+  mm0 = (mmx_4_16)__builtin_ia32_psubsb((mmx_8_8)mm0,(mmx_8_8)mm4);
+  mm3 = (mmx_4_16)__builtin_ia32_psubsb((mmx_8_8)mm3,(mmx_8_8)mm2);
+
+#ifdef DEBUG
+{
+  int p1 = e1[0];
+  int p5 = e2[1];
+  char *cmm0 = (void*)&mm0, *cmm3 = (void*)&mm3;
+  if (((int)cmm3[0]) != ((int)(p1&Rmask) + (int)(p5&Rmask) - 32)) abort();
+  if (((int)cmm0[4]) != -((int)(p1&Rmask) - (int)(p5&Rmask))) abort();
+}
+#endif
+
+  /* Intermediate stats:
+
+    (rough) code equivalent:
+    rmean = (((int)(e1&Rmask) + (int)(e2&Rmask)) >> 16) - 32;
+    r = ((int)(e2&Rmask) - (int)(e1&Rmask)) >> 16;
+    g = ((int)(e1&Gmask) - (int)(e2&Gmask)) >> 8;
+    b = ((int)(e1&Bmask) - (int)(e2&Bmask));
+
+    Gain:
+    1 distance w/o mmx = 16 ops
+    4 distances w/ mmx = 11 ops
+    (possible parallelism left to the compiler)
+
+    Todo:
+    ((160+rmean)*r/8*r/8) + 256*g/8*g/8 + ((160-rmean)*b/8*b/8)
+
+    (slightly incorrect: the result is the true difference plus (b/8)^2, but
+    this eliminates a constant, making the algorithm fit into the available
+    8 registers)
+
+    d1r/8, d3r/8, d8r/8, d0r/8, d1b/8, d3b/8, d8b/8, d0b/8 = mm0
+    d1g/8, d3g/8, d8g/8, d0g/8, d1rmean/4-32, d3rmean/4-32, d8rmean/4-32, d0rmean/4-32 = mm3
+  */
+
+  /*
+    prepare differences for final calculation:
+
+    00 d1r/2 00 d3r/2 00 d8r/2 00 d0r/2 = mm0
+    00 d1b/2 00 d3b/2 00 d8b/2 00 d0b/2 = mm1
+    00 d1g/8 00 d3g/8 00 d8g/8 00 d0g/8 = mm2
+    00 (d1rmean/4-32)*16 00 (d3rmean/4-32)*16 00 (d8rmean/4-32)*16 00 (d0rmean/4-32)*16 = mm3
+
+  */
+
+#ifdef DEBUG
+  m0 = mm0;
+  m1 = mm3;
+#endif
+  // TODO: compiler error at __builtin_ia32_psllb(mm0,2);
+  d(t0) mm0 = __builtin_ia32_pmullw(mm0,(mmx_4_16)(0x0004000400040004ULL));
+  d(t1) mm1 = (mmx_4_16)__builtin_ia32_punpcklbw(zero, (mmx_8_8)mm0);
+  d(t2) mm0 = (mmx_4_16)__builtin_ia32_punpckhbw(zero, (mmx_8_8)mm0);
+  d(t3) mm2 = (mmx_4_16)__builtin_ia32_punpckhbw(zero, (mmx_8_8)mm3);
+  d(t4) mm3 = (mmx_4_16)__builtin_ia32_punpcklbw(zero, (mmx_8_8)mm3);
+  // TODO: compiler error at __builtin_ia32_psraw(mm3,4);
+  d(t5) mm3 = __builtin_ia32_pmulhw(mm3,(mmx_4_16)(0x1000100010001000ULL));
+
+  /*
+    intermediate results: squares and rmean factors
+
+    00 (d1r/2)^2 00 (d3r/2)^2 00 (d8r/2)^2 00 (d0r/2)^2 = mm0
+    00 (d1b/2)^2 00 (d3b/2)^2 00 (d8b/2)^2 00 (d0b/2)^2 = mm1
+    00 256*(d1g/8)^2 00 256*(d3g/8)^2 00 256*(d8g/8)^2 00 256*(d0g/8)^2 = mm2
+    00 128+d1rmean/4 00 128+d3rmean/4 00 128+d8rmean/4 00 128+d0rmean/4 = mm3
+    00 192-d1rmean/4 00 192-d3rmean/4 00 192-d8rmean/4 00 192-d0rmean/4 = mm4
+
+  */
+
+  d(t9) mm0 = __builtin_ia32_pmulhw(mm0,mm0);
+  d(t10) mm1 = __builtin_ia32_pmulhw(mm1,mm1);
+  d(t11) mm2 = __builtin_ia32_pmulhw(mm2,mm2);
+  d(t12) mm4 = __builtin_ia32_psubsw(rmean_off,mm3);
+  d(t13) mm3 = __builtin_ia32_paddsw(mm3,rmean_off);
+
+  /*
+    intermediate results: finish red and blue components
+
+    00 (128+d1rmean/4)*(d1r/8)^2 00 (128+d3rmean/4)*(d3r/8)^2 00 (128+d8rmean/4)*(d8r/8)^2 00 (128+d0rmean/4)*(d0r/8)^2 = mm0
+    00 (192-d1rmean/4)*(d1b/8)^2 00 (192-d3rmean/4)*(d3b/8)^2 00 (192-d8rmean/4)*(d8b/8)^2 00 (192-d0rmean/4)*(d0b/8)^2 = mm1
+    00 (d1g/8)^2 00 (d3g/8)^2 00 (d8g/8)^2 00 (d0g/8)^2 = mm2
+
+  */
+
+  d(t15) mm1 = __builtin_ia32_pmulhw(mm1, mm4);
+  d(t16) mm0 = __builtin_ia32_pmulhw(mm0, mm3);
+
+  /*
+    calculate final visual difference
+
+    (128+rmean/4)*(r/8)^2+(192-rmean/4)*(b/8)^2+256*(g/8)^2 = mm0 (order: 1 3 8 0)
+  */
+
+  d(t17) mm0 = __builtin_ia32_paddw(mm0,mm1);
+  d(t18) mm0 = __builtin_ia32_paddw(mm0,mm2);
+
+#ifdef DEBUG
+{
+  int p1 = e1[0];
+  int p5 = e2[1];
+  short *smm0 = (void*)&mm0;
+  long r,g,b;
+  long rmean, diff;
+  rmean = (((p1+p5)&Rmask)-32);
+  r = (p1&Rmask)-(p5&Rmask);
+  g = ((p1&Gmask) - (p5&Gmask)) >> 8;
+  b = ((p1&Bmask) - (p5&Bmask)) >> 16;
+
+  diff = ((160+rmean)*r*r + 256*g*g + (160-rmean)*b*b)/256;
+  if (diff > smm0[0]+1 || diff < smm0[0]-1) abort();
+}
+#endif
+
+  /*
+    Code equivalent:
+    ((((512+rmean)>>8)*r*r) + 4*g*g + (((768-rmean)>>8)*b*b))
+
+    test against threshold
+
+    (diff1?0xffff:0x0000) (diff3?0xffff:0x0000) (diff8?0xffff:0x0000) (diff0?0xffff:0x0000) = mm0
+  */
+
+  mm0 = __builtin_ia32_pcmpgtw(mm0,mmx_trigger);
+
+  /*
+    create final bit patterns
+
+    0000 0000 (diff1*0x03000300)|(diff3*0x0c000c00)|(diff8*0x005500aa)|(diff0*0x00aa0055)
+
+  */
+
+  mm0 = (mmx_4_16)__builtin_ia32_pmaddwd(mm0,factors);
+  mm0 = (mmx_4_16)__builtin_ia32_punpcklbw((mmx_8_8)mm0,(mmx_8_8)__builtin_ia32_punpckhbw((mmx_8_8)mm0,(mmx_8_8)mm0));
+
+  return (unsigned long)(unsigned long long)mm0;
+  /*
+    Total: 11+16+3 = 30 ops for 4 distances vs.
16+13+7 = 36 ops for 1 distance
+  */
+}
+#endif
+#endif
+/* Palette index -> packed pixel value; entries are written by Hq2x_InitLUTs. */
+static int LUTPAL8to32[256] __attribute__((aligned(32)));
+//#define factors(a,b,c,d) (((a)-1) | ((b)<<2) | ((c)<<4) | ((d)<<6))
+#define P0 {8,0,0,0}
+#define P10 {6,2,0,0}
+#define P11 {6,0,0,2}
+#define P12 {6,0,2,0}
+#define P20 {4,0,2,2}
+#define P21 {4,2,2,0}
+#define P22 {4,2,0,2}
+#define P60 {5,0,2,1}
+#define P61 {5,0,1,2}
+#define P70 {6,0,1,1}
+#define P90 {2,0,3,3}
+#define P100 {7,0,0,1}
+#define X {0,0,0,0}
+#define UNUSED X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, \
+ X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X
+
+/* sparse table: only 2k of the 4k entries are used; each group of 128 used entries is followed by 128 UNUSED ones. The row labels are byte offsets (4 bytes per entry). */
+static unsigned char factors[4096][4] __attribute__((aligned(32))) = {
+/* 0000 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 0040 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 0080 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 00c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 0100 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 0140 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 0180 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 01c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 0200 */ UNUSED,
+/* 0400 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0440 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0480 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 04c0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0500 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0540 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0580 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 05c0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0600 */ UNUSED,
+/* 0800 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 0840 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 0880 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 08c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 0900 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 0940 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 0980 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 09c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 0a00 */ UNUSED,
+/* 0c00 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0c40 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0c80 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0cc0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0d00 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0d40 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0d80 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0dc0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 0e00 */ UNUSED,
+/* 1000 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 1040 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 1080 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 10c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 1100 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 1140 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 1180 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 11c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 1200 */ UNUSED,
+/* 1400 */ P20, P20, P10, P0, P20, P20, P10, P0, P90, P90, P10, P0, P90, P90, P10, P0,
+/* 1440 */ P20, P20, P10, P0, P20, P20, P10, P0, P90, P90, P10, P0, P90, P90, P10, P0,
+/* 1480 */ P90, P90, P10, P0, P90, P90, P10, P0, P70, P100,P10, P0, P70, P100,P10, P0,
+/* 14c0 */ P90, P90, P10, P0, P90, P90, P10, P0, P70, P100,P10, P0, P70, P100,P10, P0,
+/* 1500 */ P20, P20, P10, P0, P20, P20, P10, P0, P90, P90, P10, P0, P90, P90, P10, P0,
+/* 1540 */ P20, P20, P10, P0, P20, P20, P10, P0, P90, P90, P10, P0, P90, P90, P10, P0,
+/* 1580 */ P90, P90, P10, P0, P90, P90, P10, P0, P70, P100,P10, P0, P70, P100,P10, P0,
+/* 15c0 */ P90, P90, P10, P0, P90, P90, P10, P0, P70, P100,P10, P0, P70, P100,P10, P0,
+/* 1600 */ UNUSED,
+/* 1800 */ P21, P61, P21, P61, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
+/* 1840 */ P21, P61, P21, P61, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
+/* 1880 */ P21, P61, P21, P61, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
+/* 18c0 */ P21, P61, P21, P61, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
+/* 1900 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 1940 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 1980 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 19c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 1a00 */ UNUSED,
+/* 1c00 */ P20, P20, P0, P0, P20, P20, P0, P0, P10, P20, P10, P0, P10, P20, P10, P0,
+/* 1c40 */ P20, P20, P0, P0, P20, P20, P0, P0, P10, P20, P10, P0, P10, P20, P10, P0,
+/* 1c80 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
+/* 1cc0 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
+/* 1d00 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0,
+/* 1d40 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0,
+/* 1d80 */ P70, P90, P10, P0, P70, P90, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
+/* 1dc0 */ P70, P90, P10, P0, P70, P90, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
+/* 1e00 */ UNUSED,
+/* 2000 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 2040 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 2080 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 20c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 2100 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 2140 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 2180 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 21c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 2200 */ UNUSED,
+/* 2400 */ P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, /* above */
+/* 2440 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 2480 */ P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60,
+/* 24c0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 2500 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 2540 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 2580 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 25c0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 2600 */ UNUSED,
+/* 2800 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 2840 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 2880 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 28c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 2900 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 2940 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 2980 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 29c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
+/* 2a00 */ UNUSED,
+/* 2c00 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, /* rot */
+/* 2c40 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 2c80 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P60, P22, P60, P22, P60, P22, P60,
+/* 2cc0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 2d00 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 2d40 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 2d80 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 2dc0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
+/* 2e00 */ UNUSED,
+/* 3000 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 3040 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 3080 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 30c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 3100 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 3140 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 3180 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 31c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 3200 */ UNUSED,
+/* 3400 */ P20, P20, P0, P0, P20, P20, P0, P0, P70, P20, P10, P0, P70, P20, P10, P0,
+/* 3440 */ P20, P20, P0, P0, P20, P20, P0, P0, P70, P20, P10, P0, P70, P20, P10, P0,
+/* 3480 */ P10, P20, P10, P0, P10, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
+/* 34c0 */ P10, P20, P10, P0, P10, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
+/* 3500 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P90, P10, P0, P70, P90, P10, P0,
+/* 3540 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P90, P10, P0, P70, P90, P10, P0,
+/* 3580 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
+/* 35c0 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
+/* 3600 */ UNUSED,
+/* 3800 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 3840 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 3880 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
+/* 38c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
+/* 3900 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 3940 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 3980 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 39c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
+/* 3a00 */ UNUSED,
+/* 3c00 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0,
+/* 3c40 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0,
+/* 3c80 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
+/* 3cc0 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
+/* 3d00 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P20, P10, P0, P10, P20, P10, P0,
+/* 3d40 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P20, P10, P0, P10, P20, P10, P0,
+/* 3d80 */ P10, P20, P10, P0, P10, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
+/* 3dc0 */ P10, P20, P10, P0, P10, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
+/* 3e00 */ UNUSED
+};
+
+/* Memory usage at 320 pixels width:
+
+ ~4k line buffer
+ ~2.5k pattern buffer
+ 8k factor table (can be packed into as little as 2k, but unpacking takes more time than we lose due
+ to cache thrashing, at least on a pentium2)
+ 1k palette table
+------
+~15.5k data
+
+If the diff table is used, another 8k (packed) / 64k (unpacked) are used.
+*/
+
+/* Optimizations:
+
+Pixel/Pattern layout:
+
+1 2 3
+ a c
+4 5 6
+ b d
+7 8 9
+
+Factor storage (one row per P<n> macro above, same column order):
+Pixel: 5 1 2 4
+
+ 0: 8 0 0 0
+ 10: 6 2 0 0
+ 11: 6 0 0 2
+ 12: 6 0 2 0
+ 20: 4 0 2 2
+ 21: 4 2 2 0
+ 22: 4 2 0 2
+ 60: 5 0 2 1
+ 61: 5 0 1 2
+ 70: 6 0 1 1
+ 90: 2 0 3 3
+100: 7 0 0 1
+
+bits: 3 1 2 2 = 8 bit
+
+Factor set 100 is wrong: 7 0 0.5 0.5 would be 100% like the original, but
+since pixel 2 and 4 are visually close for all patterns where this set is
+used, this important simplification should not be visible.
+ +Pattern usage: + +a b c d +0 8 1 3 10 5 6 (a) 9 2 (b) 4 7 (11) (c) (d) -> 1 2 4 5 + +0 8 10 5 9 2 7 x 1 6 3 4 + +cr+a2 dr+b2 ar br +2 9 1 4 11 7 6 (d) 8 0 (x) 3 5 (10) (y) (a) -> 3 2 6 5 + +br+a3 ar dr+c3 cr +5 10 6 3 8 0 1 (m) 11 7 (c) 4 2 (9) (b) (n) -> 7 8 4 5 + +d+b2+c3 c+a2 b+a3 a +7 11 6 4 9 2 1 (n) 10 5 (y) 3 0 (8) (x) (m) -> 9 8 6 5 + + +Pattern storage: + +0123 = \/|_ + +0101 010x 2323 +1010 101x 2323 + +*/ + +union pattern { + unsigned short p[2]; + unsigned long value; +}; + +static unsigned long lines0[Hq2x_MAXWIDTH+2] __attribute__((aligned(32))); +static unsigned long lines1[Hq2x_MAXWIDTH+2] __attribute__((aligned(32))); +static unsigned long lines2[Hq2x_MAXWIDTH+2] __attribute__((aligned(32))); +static unsigned long *l1, *l2, *l3, *tmp; +static union pattern p0[Hq2x_MAXWIDTH+2] __attribute__((aligned(32))); +static union pattern p1[Hq2x_MAXWIDTH+2] __attribute__((aligned(32))); +static union pattern *top, *bot, *ptmp; +static unsigned char prev[Hq2x_MAXWIDTH+2]; + +#ifdef DIFF_TABLE +#define diffcall bot[i].value = Diff(prev+i,pIn+i); memcpy(prev,pIn,Scaler_SrcWidth+1); +#else +#define diffcall bot[i].value = Diff(l2+i,l3+i); +#endif + +#ifndef __MMX__ +#define __builtin_ia32_emms() +#endif + +#define CONSTCHECK if (Scaler_SrcWidth == 320) RENDER_DrawLine = + +#define store(out,index,x,y) do{((unsigned long*)out)[index*2] = 0xff000000|(x); ((unsigned long*)out)[index*2+1] = 0xff000000|(y);}while(0) +#define type long + +#define FUNC Hq2x_long_320_line +#define Scaler_SrcWidth 320 +#define CHECK_CONST +#include "render_hq2x_template.h" +#undef CHECK_CONST +#undef Scaler_SrcWidth +#undef FUNC + +#define FUNC Hq2x_long_Scaler_SrcWidth_line +#define CHECK_CONST CONSTCHECK Hq2x_long_320_line; +#include "render_hq2x_template.h" +#undef store +#undef type +#undef CHECK_CONST +#undef FUNC + +/* 16 bit support */ +#ifdef WORDS_BIGENDIAN +#define store(out,index,y,x) ((unsigned long *)out)[index] = 
(((((x)>>3)&0x1f)|(((y)<<13))&0x1f0000)|((((x)>>5)&0x7e0)|(((y)<<11))&0x7e00000)|((((x)>>8)&0xf800)|(((y)<<8))&0xf8000000)) +#else +#define store(out,index,x,y) ((unsigned long *)out)[index] = (((((x)>>3)&0x1f)|(((y)<<13))&0x1f0000)|((((x)>>5)&0x7e0)|(((y)<<11))&0x7e00000)|((((x)>>8)&0xf800)|(((y)<<8))&0xf8000000)) +#endif +#define type short + +#define FUNC Hq2x_short_320_line +#define Scaler_SrcWidth 320 +#define CHECK_CONST +#include "render_hq2x_template.h" +#undef CHECK_CONST +#undef Scaler_SrcWidth +#undef FUNC + +#define FUNC Hq2x_short_Scaler_SrcWidth_line +#define CHECK_CONST CONSTCHECK Hq2x_short_320_line; +#include "render_hq2x_template.h" +#undef store +#undef type +#undef CHECK_CONST +#undef FUNC + /* Scaler descriptor for hq2x. It lists the variable-width short handler twice, followed by the variable-width long handler. NOTE(review): the meaning of each slot comes from struct ScalerBlock, which is declared elsewhere - confirm it there. */ +ScalerBlock Hq2x_8={ + CAN_16|CAN_32|LOVE_32|NEED_RGB, + 2,2,1, + 0,Hq2x_short_Scaler_SrcWidth_line,Hq2x_short_Scaler_SrcWidth_line,Hq2x_long_Scaler_SrcWidth_line +}; + /* Rebuilds the hq2x lookup data. Entries palette_start..palette_end (inclusive) of LUTPAL8to32 are refilled from pal, which is read as an array of struct GFX_PalEntry; if palette_start is greater than palette_end the loop does not run and pal is never dereferenced. When DIFF_TABLE is defined the whole 256x256 difference table is then recomputed, and when __MMX__ is defined mmx_trigger is reloaded from Hq2x_colourTrigger. */ +void Hq2x_InitLUTs(const void *pal, int palette_end, int palette_start) +{ + int i, j; + struct GFX_PalEntry *palette = (struct GFX_PalEntry *)pal; + + // All components are reduced to 5 bit (VGA palette has 6 bit) + // for simpler multiplication and storage (divided by 8) + for (i=palette_start; i<=palette_end; i++) { + // 5 significant bits with 3 bit multiplier fit into 8 bit, thus + // plain int multiplication can be used without tricks + // R is duplicated into A, negated and increased by 32 for some + // nice mmx distance calculation tricks + LUTPAL8to32[i] = ((palette[i].r&0xf8) << 13) | ((palette[i].g&0xf8) << 5) | ((palette[i].b&0xf8) >> 3) | ((32*8-(palette[i].r&0xf8)) << 21); + } + +#ifdef DIFF_TABLE +#if DIFF_TABLE != 1 + memset(difftable,0,sizeof(difftable)); +#endif + for (i = 0; i < 256; i++) { + for (j = 0; j < 256; j++) { + difftable[(i) * (256/bits) + ((j) / bits)] +#if DIFF_TABLE == 1 + = +#else + |= +#endif + Diff1_calc((LUTPAL8to32[i]>>16)&0x1f,(LUTPAL8to32[i]>>8)&0x1f,(LUTPAL8to32[i])&0x1f, (LUTPAL8to32[j]>>16)&0x1f,(LUTPAL8to32[j]>>8)&0x1f,(LUTPAL8to32[j])&0x1f) << 
(j%bits); + } + } +#endif + +#ifdef __MMX__ + *((short *)(&mmx_trigger)) = Hq2x_colourTrigger; + *(((short *)(&mmx_trigger))+1) = Hq2x_colourTrigger; + *(((short *)(&mmx_trigger))+2) = Hq2x_colourTrigger; + *(((short *)(&mmx_trigger))+3) = Hq2x_colourTrigger; +#endif +} + /* Raises the static threshold Hq2x_colourTrigger by one, up to 255, and logs it. Hq2x_InitLUTs(0,0,1) passes an empty palette range (start 1 > end 0), so no palette entry is touched and only the DIFF_TABLE / __MMX__ parts of Hq2x_InitLUTs run. The threshold is declared long, so it is logged with a long conversion. */ +void Hq2x_IncreaseThreshold(void) +{ + if (Hq2x_colourTrigger < 255) Hq2x_colourTrigger++; + Hq2x_InitLUTs(0,0,1); + LOG_MSG("Hq2x threshold at %li",Hq2x_colourTrigger); +} + /* Lowers the static threshold Hq2x_colourTrigger by one, down to 0, then refreshes the derived data and logs the value exactly as the increase path does. */ +void Hq2x_DecreaseThreshold(void) +{ + if (Hq2x_colourTrigger > 0) Hq2x_colourTrigger--; + Hq2x_InitLUTs(0,0,1); + LOG_MSG("Hq2x threshold at %li",Hq2x_colourTrigger); +} + /* Raises the adaptive threshold Hq2x_colourTrigger_adaptive by one, up to 100, and logs it; no table rebuild is requested here. */ +void Hq2x_IncreaseThresholdAdaptive(void) +{ + if (Hq2x_colourTrigger_adaptive < 100) Hq2x_colourTrigger_adaptive++; + LOG_MSG("Hq2x adaptive threshold at %li",Hq2x_colourTrigger_adaptive); +} + /* Lowers the adaptive threshold Hq2x_colourTrigger_adaptive by one, down to 0, and logs it; no table rebuild is requested here. */ +void Hq2x_DecreaseThresholdAdaptive(void) +{ + if (Hq2x_colourTrigger_adaptive > 0) Hq2x_colourTrigger_adaptive--; + LOG_MSG("Hq2x adaptive threshold at %li",Hq2x_colourTrigger_adaptive); +} + diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render_hq2x.h dosbox-0.61+hq2x/src/gui/render_hq2x.h --- dosbox-0.61/src/gui/render_hq2x.h 1970-01-01 01:00:00.000000000 +0100 +++ dosbox-0.61+hq2x/src/gui/render_hq2x.h 2004-08-02 20:34:46.000000000 +0200 @@ -0,0 +1,31 @@ +//derived from the hq2x filter demo program +//---------------------------------------------------------- +//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com ) +// Speed optimization and mmx code Copyright (c) 2004 Jörg Walter (jwalt@garni.ch) + +//This program is free software; you can redistribute it and/or +//modify it under the terms of the GNU Lesser General Public +//License as published by the Free Software Foundation; either +//version 2.1 of the License, or (at your option) any later version. 
+// +//This program is distributed in the hope that it will be useful, +//but WITHOUT ANY WARRANTY; without even the implied warranty of +//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +//Lesser General Public License for more details. +// +//You should have received a copy of the GNU Lesser General Public +//License along with this program; if not, write to the Free Software +//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#ifndef __HQ2X_H +#define __HQ2X_H +/* Largest supported source width. The scaler buffers hold Hq2x_MAXWIDTH+2 entries and the line template indexes them up to Scaler_SrcWidth+1. Parenthesized so the value stays intact next to operators that bind tighter than minus. */ +#define Hq2x_MAXWIDTH (640-2) +extern long Hq2x_colourTrigger; +extern long Hq2x_colourTrigger_adaptive; +extern void Hq2x_InitLUTs(const void *palette, int palette_end, int palette_start); +extern void Hq2x_IncreaseThreshold(void); +extern void Hq2x_DecreaseThreshold(void); +extern void Hq2x_IncreaseThresholdAdaptive(void); +extern void Hq2x_DecreaseThresholdAdaptive(void); +#endif diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render_hq2x_template.h dosbox-0.61+hq2x/src/gui/render_hq2x_template.h --- dosbox-0.61/src/gui/render_hq2x_template.h 1970-01-01 01:00:00.000000000 +0100 +++ dosbox-0.61+hq2x/src/gui/render_hq2x_template.h 2004-08-02 17:27:01.000000000 +0200 @@ -0,0 +1,79 @@ +/* Line handler template, compiled once per FUNC / type / store / Scaler_SrcWidth combination set up by the including file. Each call takes one source line. The first call of a frame (Scaler_Line == 0) runs CHECK_CONST, fills the buffers and returns without writing output; every later call rotates the buffers and writes the two output rows for the previous source line, repeated so that (*Scaler_Index)+1 rows are produced in total (upper row linesa times, lower row linesb times). After the last source line the function calls itself once more with the same input so the final line is written too. */ +static void FUNC(const unsigned char *pIn) +{ + int i, j; + unsigned int factor, value1, value2, linesa = (*Scaler_Index++)+1, linesb = linesa/2; + linesa -= linesb; + + pIn--; + if (__builtin_expect(Scaler_Line++==0,0)) { + int i; + CHECK_CONST + + Scaler_DstWrite -= 2*sizeof(type); + l1 = lines0; + l2 = lines1; + l3 = lines2; + for (i=0; i <= Scaler_SrcWidth+1; i++) l2[i] = 0x20000000; + l3[0] = 0x20000000; + l3[1] = LUTPAL8to32[pIn[1]]; + + for (i=2; i<=Scaler_SrcWidth+1; i++) + l3[i] = LUTPAL8to32[pIn[i]]; + + top = p0; + bot = p1; + memcpy(prev,pIn,Scaler_SrcWidth+1); + for (i=1; i <= Scaler_SrcWidth; i++) 
diffcall + return; + } + /* Rotate the three line buffers so that l3 can take the new input line, and swap the two pattern rows. */ + tmp = l1; l1 = l2; l2 = l3; l3 = tmp; + ptmp = top; top = bot; bot = ptmp; + bot[0].value = 0x07ff07ff; + + l3[0] = 0x20000000; + l3[1] = LUTPAL8to32[pIn[1]]; + + for (i=2; i<=Scaler_SrcWidth+1; i++) + l3[i] = LUTPAL8to32[pIn[i]]; + + for (i=1; i<=Scaler_SrcWidth; i++) diffcall + /* Upper output row: each source position i yields two output pixels, blended from l1 and l2 with the weights picked from factors by the masked pattern bits; the finished row is then duplicated linesa-1 more times. */ + if (linesa > 0) { + for (i=1; i<=Scaler_SrcWidth; i++) { + factor = (top[i-1].p[0]&0x503)|(bot[i-1].p[0]&0x20c)|(top[i].p[0]&0x830)|(bot[i].p[0]&0x040); + value1 = (l1[i-1]*factors[factor][1]+l1[i]*factors[factor][2]+l2[i-1]*factors[factor][3]+l2[i]*factors[factor][0]); + + factor = (top[i-1].p[1]&0x930)|(bot[i-1].p[1]&0x240)|(top[i].p[1]&0x403)|(bot[i].p[1]&0x00c); + value2 = (l1[i+1]*factors[factor][1]+l1[i]*factors[factor][2]+l2[i+1]*factors[factor][3]+l2[i]*factors[factor][0]); + store(Scaler_DstWrite,i,value1,value2); + } + while (--linesa) { + memcpy(Scaler_DstWrite+Scaler_DstPitch,Scaler_DstWrite,Scaler_DstPitch); + Scaler_DstWrite += Scaler_DstPitch; + } + Scaler_DstWrite += Scaler_DstPitch; + } + /* Lower output row: same scheme with l3 in place of l1, duplicated linesb-1 more times. */ + if (linesb > 0) { + for (i=1; i <= Scaler_SrcWidth; i++) { + factor = (top[i-1].p[1]&0x60c)|(bot[i-1].p[1]&0x103)|(top[i].p[1]&0x840)|(bot[i].p[1]&0x030); + value1 = (l3[i-1]*factors[factor][1]+l3[i]*factors[factor][2]+l2[i-1]*factors[factor][3]+l2[i]*factors[factor][0]); + + factor = (top[i-1].p[0]&0xa40)|(bot[i-1].p[0]&0x130)|(top[i].p[0]&0x40c)|(bot[i].p[0]&0x003); + value2 = (l3[i+1]*factors[factor][1]+l3[i]*factors[factor][2]+l2[i+1]*factors[factor][3]+l2[i]*factors[factor][0]); + store(Scaler_DstWrite,i,value1,value2); + } + while (--linesb) { + memcpy(Scaler_DstWrite+Scaler_DstPitch,Scaler_DstWrite,Scaler_DstPitch); + Scaler_DstWrite += Scaler_DstPitch; + } + Scaler_DstWrite += Scaler_DstPitch; + } + /* Output lags the input by one line, so once the last source line has been taken in the function calls itself again with the same line (pIn+1 undoes the decrement made on entry) to write that line's rows as well. */ + if (__builtin_expect(Scaler_Line==Scaler_SrcHeight,0)) { + FUNC(pIn+1); + __builtin_ia32_emms(); + } +} diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x 
'*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render_scalers.h dosbox-0.61+hq2x/src/gui/render_scalers.h --- dosbox-0.61/src/gui/render_scalers.h 2004-06-10 09:18:19.000000000 +0200 +++ dosbox-0.61+hq2x/src/gui/render_scalers.h 2004-07-04 23:29:49.000000000 +0200 @@ -30,6 +30,7 @@ OP_AdvInterp2x, OP_Interp2x, OP_TV2x, + OP_Hq2x, }; struct ScalerBlock { @@ -46,6 +47,7 @@ extern ScalerBlock AdvInterp2x_8; extern ScalerBlock Interp2x_8; extern ScalerBlock TV2x_8; +extern ScalerBlock Hq2x_8; #endif diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/hardware/ymf262.c dosbox-0.61+hq2x/src/hardware/ymf262.c --- dosbox-0.61/src/hardware/ymf262.c 2004-03-28 15:04:45.000000000 +0200 +++ dosbox-0.61+hq2x/src/hardware/ymf262.c 2004-06-20 03:54:47.000000000 +0200 @@ -844,23 +844,52 @@ INLINE signed int op_calc(UINT32 phase, unsigned int env, signed int pm, unsigned int wave_tab) { UINT32 p; + int pos = (((signed int)((phase & ~FREQ_MASK) + (pm<<16))) >> FREQ_SH ); +#ifdef SMALL_CACHE + if ((wave_tab == 1*SIN_LEN) && (pos & (SIN_LEN>>1))) pos = 0; + if ((wave_tab == 3*SIN_LEN) && (pos & (SIN_LEN>>2))) pos = 0; + if (wave_tab == 2*SIN_LEN || wave_tab == 3*SIN_LEN) pos &= SIN_MASK>>1; + if (wave_tab == 4*SIN_LEN || wave_tab == 5*SIN_LEN) { + if (wave_tab == 5*SIN_LEN) pos &= SIN_MASK>>1; + pos *= 2; + if (pos & (SIN_LEN>>1)) pos = 0; + } + if (wave_tab != 6*SIN_LEN && wave_tab != 7*SIN_LEN) wave_tab = 0; +#endif + p = (env<<4) + sin_tab[wave_tab + (pos & SIN_MASK)]; - p = (env<<4) + sin_tab[wave_tab + ((((signed int)((phase & ~FREQ_MASK) + (pm<<16))) >> FREQ_SH ) & SIN_MASK) ]; - - if (p >= TL_TAB_LEN) - return 0; +#if 1 + return tl_tab[p&(TL_TAB_LEN/13-1)] >> (p/(TL_TAB_LEN/13)); /* NOTE(review): the mask only acts as a modulo if TL_TAB_LEN/13 is a power of two - confirm */ +#else + if (p >= TL_TAB_LEN) return 0; /* same bound as the guard removed by this patch, so p == TL_TAB_LEN is rejected as before */ return tl_tab[p]; +#endif } INLINE signed int op_calc1(UINT32 phase, 
unsigned int env, signed int pm, unsigned int wave_tab) { UINT32 p; + int pos = (((signed int)((phase & ~FREQ_MASK) + pm)) >> FREQ_SH ); +#ifdef SMALL_CACHE + if ((wave_tab == 1*SIN_LEN) && (pos & (SIN_LEN>>1))) pos = 0; + if ((wave_tab == 3*SIN_LEN) && (pos & (SIN_LEN>>2))) pos = 0; + if (wave_tab == 2*SIN_LEN || wave_tab == 3*SIN_LEN) pos &= SIN_MASK>>1; + if (wave_tab == 4*SIN_LEN || wave_tab == 5*SIN_LEN) { + if (wave_tab == 5*SIN_LEN) pos &= SIN_MASK>>1; + pos *= 2; + if (pos & (SIN_LEN>>1)) pos = 0; + } + if (wave_tab != 6*SIN_LEN && wave_tab != 7*SIN_LEN) wave_tab = 0; +#endif - p = (env<<4) + sin_tab[wave_tab + ((((signed int)((phase & ~FREQ_MASK) + pm))>>FREQ_SH) & SIN_MASK)]; + p = (env<<4) + sin_tab[wave_tab + (pos & SIN_MASK)]; - if (p >= TL_TAB_LEN) - return 0; +#if 1 + return tl_tab[p&(TL_TAB_LEN/13-1)] >> (p/(TL_TAB_LEN/13)); /* NOTE(review): the mask only acts as a modulo if TL_TAB_LEN/13 is a power of two - confirm */ +#else + if (p >= TL_TAB_LEN) return 0; /* same bound as the guard removed by this patch, so p == TL_TAB_LEN is rejected as before */ return tl_tab[p]; +#endif } diff -ruN src./dosbox.cpp src/dosbox.cpp --- dupa/src./dosbox.cpp 2004-09-30 15:15:59.000000000 +0200 +++ dupa/src/dosbox.cpp 2004-09-30 15:18:48.301932384 +0200 @@ -231,11 +231,17 @@ secprop->Add_int("frameskip",0); secprop->Add_bool("aspect",false); secprop->Add_string("scaler","normal2x"); + secprop->Add_int("hq2x_threshold_adaptive",75); + secprop->Add_int("hq2x_threshold",0); MSG_Add("RENDER_CONFIGFILE_HELP", "frameskip -- How many frames dosbox skips before drawing one.\n" "aspect -- Do aspect correction.\n" "scaler -- Scaler used to enlarge/enhance low resolution modes.\n" - " Supported are none,normal2x,advmame2x,advmame3x,advinterp2x,interp2x,tv2x.\n" + " Supported are none,normal2x,advmame2x,advmame3x,advinterp2x,interp2x,tv2x,hq2x.\n" + "hq2x_threshold_adaptive -- The adaptive threshold used to detect edges in hq2x\n" + " Possible values are 0-100, can be modified with Ctrl+Alt+F5/F6\n" + "hq2x_threshold -- The static threshold used to detect edges in hq2x\n" + " Possible values are 0-255, can be modified with Ctrl+Alt+F3/F4\n" ); 
secprop=control->AddSection_prop("cpu",&CPU_Init);