+++ /dev/null
-diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/Makefile.am dosbox-0.61+hq2x/src/gui/Makefile.am
---- dosbox-0.61/src/gui/Makefile.am 2004-07-05 02:44:22.000000000 +0200
-+++ dosbox-0.61+hq2x/src/gui/Makefile.am 2004-07-04 23:25:07.000000000 +0200
-@@ -3,5 +3,6 @@
- noinst_LIBRARIES = libgui.a
- libgui_a_SOURCES = sdlmain.cpp sdl_mapper.cpp \
- render.cpp render_scalers.cpp render_scalers.h render_templates.h \
-- midi.cpp midi_win32.h midi_oss.h midi_coreaudio.h midi_alsa.h
-+ midi.cpp midi_win32.h midi_oss.h midi_coreaudio.h midi_alsa.h \
-+ render_hq2x.cpp render_hq2x.h
-
-diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render.cpp dosbox-0.61+hq2x/src/gui/render.cpp
---- dosbox-0.61/src/gui/render.cpp 2004-08-05 00:12:58.732847304 +0200
-+++ dosbox-0.61+hq2x/src/gui/render.cpp 2004-08-04 23:50:12.000000000 +0200
-@@ -33,6 +33,7 @@
- #include "support.h"
-
- #include "render_scalers.h"
-+#include "render_hq2x.h"
-
- struct PalData {
- struct {
-@@ -190,6 +191,9 @@
- }
- break;
- }
-+ if (render.op.type == OP_Hq2x) {
-+ Hq2x_InitLUTs((void*)render.pal.rgb,render.pal.last,render.pal.first);
-+ }
- /* Setup pal index to startup values */
- render.pal.first=256;
- render.pal.last=0;
-@@ -314,6 +318,7 @@
- case OP_Interp2x:block=&Interp2x_8;break;
- case OP_AdvInterp2x:block=&AdvInterp2x_8;break;
- case OP_TV2x:block=&TV2x_8;break;
-+ case OP_Hq2x:block=&Hq2x_8;break;
- }
- gfx_flags=GFX_GetBestMode(block->flags);
- if (!gfx_flags) {
-@@ -362,7 +367,7 @@
-
- extern void GFX_SetTitle(Bits cycles, Bits frameskip,bool paused);
- static void IncreaseFrameSkip(void) {
-- if (render.frameskip.max<10) render.frameskip.max++;
-+ if (render.frameskip.max<25) render.frameskip.max++;
- LOG_MSG("Frame Skip at %d",render.frameskip.max);
- GFX_SetTitle(-1,render.frameskip.max,false);
- }
-@@ -376,6 +381,12 @@
- void RENDER_Init(Section * sec) {
- Section_prop * section=static_cast<Section_prop *>(sec);
-
-+ Hq2x_colourTrigger=section->Get_int("hq2x_threshold");
-+ if (Hq2x_colourTrigger > 255) Hq2x_colourTrigger = 255;
-+ if (Hq2x_colourTrigger < 0) Hq2x_colourTrigger = 0;
-+ Hq2x_colourTrigger_adaptive=section->Get_int("hq2x_threshold_adaptive");
-+ if (Hq2x_colourTrigger_adaptive > 255) Hq2x_colourTrigger_adaptive = 255;
-+ if (Hq2x_colourTrigger_adaptive <= 0) Hq2x_colourTrigger_adaptive = 75;
- render.pal.first=256;
- render.pal.last=0;
- render.aspect=section->Get_bool("aspect");
-@@ -398,12 +409,17 @@
- else if (!strcasecmp(scaler,"advinterp2x")) render.op.want_type=OP_AdvInterp2x;
- else if (!strcasecmp(scaler,"interp2x")) render.op.want_type=OP_Interp2x;
- else if (!strcasecmp(scaler,"tv2x")) render.op.want_type=OP_TV2x;
-+ else if (!strcasecmp(scaler,"hq2x")) render.op.want_type=OP_Hq2x;
- else {
- render.op.want_type=OP_Normal;
- LOG_MSG("Illegal scaler type %s,falling back to normal.",scaler);
- }
- MAPPER_AddHandler(DecreaseFrameSkip,MK_f7,MMOD1,"decfskip","Dec Fskip");
- MAPPER_AddHandler(IncreaseFrameSkip,MK_f8,MMOD1,"incfskip","Inc Fskip");
-+ MAPPER_AddHandler(Hq2x_DecreaseThreshold,MK_f3,MMOD1|MMOD2,"dechq2xthreshold","Dec Hq2x Static Threshold");
-+ MAPPER_AddHandler(Hq2x_IncreaseThreshold,MK_f4,MMOD1|MMOD2,"inchq2xthreshold","Inc Hq2x Static Threshold");
-+ MAPPER_AddHandler(Hq2x_DecreaseThresholdAdaptive,MK_f5,MMOD1|MMOD2,"dechq2xadapthreshold","Dec Hq2x Adaptive Threshold");
-+ MAPPER_AddHandler(Hq2x_IncreaseThresholdAdaptive,MK_f6,MMOD1|MMOD2,"inchq2xadapthreshold","Inc Hq2x Adaptive Threshold");
- GFX_SetTitle(-1,render.frameskip.max,false);
- }
-
-diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render_hq2x.cpp dosbox-0.61+hq2x/src/gui/render_hq2x.cpp
---- dosbox-0.61/src/gui/render_hq2x.cpp 1970-01-01 01:00:00.000000000 +0100
-+++ dosbox-0.61+hq2x/src/gui/render_hq2x.cpp 2004-08-04 23:43:53.000000000 +0200
-@@ -0,0 +1,799 @@
-+//hq2x filter demo program
-+//----------------------------------------------------------
-+//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
-+// Speed optimization and mmx code Copyright (c) 2004 Jörg Walter (jwalt@garni.ch)
-+
-+//This program is free software; you can redistribute it and/or
-+//modify it under the terms of the GNU Lesser General Public
-+//License as published by the Free Software Foundation; either
-+//version 2.1 of the License, or (at your option) any later version.
-+//
-+//This program is distributed in the hope that it will be useful,
-+//but WITHOUT ANY WARRANTY; without even the implied warranty of
-+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+//Lesser General Public License for more details.
-+//
-+//You should have received a copy of the GNU Lesser General Public
-+//License along with this program; if not, write to the Free Software
-+//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-+
-+/*
-+ This code comes in three variants:
-+ 1. plain C code with live difference calculation
-+ 2. C code with live difference calculation in MMX
-+ 3. lookup-table based difference calculation
-+
-+ Which one is fastest depends on your CPU speed and cache size. The table based algorithm
-+ should be fastest if you have 32kb L1 data cache or more. Packing diff values into fewer
-+ bytes is possible: define DIFF_TABLE to the number of bits per int.
-+
-+ Speed: 22fps/27fps(MMX)/32fps(table) on a pentium2/333MHz
-+
-+ TODO:
-+ Currently only does 32bpp/16bpp BGRA output, and (theoretically) RGB output. YUV
-+ isn't needed anymore, it seems, so this code should now work in all setups.
-+ MMX code only does RGB, thus isn't really usable (but useful for benchmarking).
-+ This code should use the intel compiler functions for mmx, as GCC emulates
-+ them more or less completely.
-+
-+ further optimization ideas:
-+ - fix gcc bugs (shift), so Diff_mmx can run without register spilling
-+ - manual unrolling of Diff loop to get decent memory prefetch for
-+ recent CPUs
-+ - add mmxext support to Diff for faster unpacking
-+ - test if sse's movntq in interpolation loop improves things
-+ - find a way to mmxify the interpolation loop sensibly
-+ (currently runs slower than non-mmx code)
-+ - find a way to save (cache-)memory in the factors table
-+ (tighter packing and double indirection are both slower on p2)
-+ - find a way for 16bpp not to suck that hard (speed-wise)
-+*/
-+
-+
-+#include <stdio.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <png.h>
-+#include <math.h>
-+
-+#include "config.h"
-+#include "dosbox.h"
-+#include "video.h"
-+#include "render_scalers.h"
-+#include "render_hq2x.h"
-+
-+#ifndef __GNUC__ /* GCC predefines __GNUC__ (there is no __GCC__ macro), so the stubs below apply only to other compilers */
-+#define __attribute__(x) /* discard GNU attribute annotations */
-+#define __builtin_expect(x,y) (x) /* branch hint collapses to the tested expression */
-+#endif
-+
-+#define ADAPTIVE
-+/* #define DEBUG */
-+// #define DIFF_TABLE 32
-+#define DIFF_TABLE 1
-+
-+// Gathered experimentally, values from 0x08-0x80 are useful, depending on graphics
-+// and your personal preference.
-+long Hq2x_colourTrigger = 0;
-+long Hq2x_colourTrigger_adaptive = 75;
-+
-+#ifdef ADAPTIVE
-+#undef DIFF_TABLE
-+#define DIFF_TABLE 1
-+#endif
-+
-+#ifdef DIFF_TABLE
-+#define bits DIFF_TABLE
-+static
-+#if DIFF_TABLE == 1
-+unsigned char
-+#else
-+int
-+#endif
-+difftable[65536/bits];
-+
-+inline static unsigned int Diff1_calc(int r1, int g1, int b1, int r2, int g2, int b2)
-+{
-+ /* Weighted squared colour distance; the red/blue weights shift with the summed red level. */
-+ const long rsum = r1+r2;
-+ const long dr = r1-r2;
-+ const long dg = g1-g2;
-+ const long db = b1-b2;
-+ const long dist = ((128+rsum)*dr*dr + (192-rsum)*db*db)/256 + dg*dg;
-+#ifdef ADAPTIVE
-+ /* Adaptive build: return the distance in excess of the static trigger, capped at 255. */
-+ const long ret = (unsigned int)dist;
-+ if (ret < Hq2x_colourTrigger) return 0;
-+ if (ret-Hq2x_colourTrigger > 255) return 255;
-+ return ret-Hq2x_colourTrigger;
-+#else
-+ return dist > Hq2x_colourTrigger; /* plain build: 1 when the distance exceeds the trigger */
-+#endif
-+}
-+
-+#if DIFF_TABLE == 1
-+#define Diff1(x,y) (difftable[(*(x)) * (256/bits) + ((*(y)) / bits)] >> ((*(y))%bits))
-+#else
-+#define Diff1(x,y) ((difftable[(*(x)) * (256/bits) + ((*(y)) / bits)] >> ((*(y))%bits)) & 1)
-+#endif
-+inline static int Diff(const unsigned char *l2, const unsigned char *l3)
-+{
-+#ifdef ADAPTIVE
-+ /* Builds the pattern bit mask for the 2x2 cell at l2/l3. Each table distance is fetched once and reused for the min/max scan and the threshold tests. */
-+ const unsigned int dA = Diff1(l2,l3+1);
-+ const unsigned int dB = Diff1(l2+1,l3);
-+ const unsigned int dC = Diff1(l2+1,l3+1);
-+ const unsigned int dD = Diff1(l3,l3+1);
-+ unsigned int max = dA, min = dA, dynthres;
-+ if (dB > max) max = dB;
-+ if (dB < min) min = dB;
-+ if (dC > max) max = dC;
-+ if (dC < min) min = dC;
-+ if (dD > max) max = dD;
-+ if (dD < min) min = dD;
-+ dynthres = (Hq2x_colourTrigger_adaptive*max+(100-Hq2x_colourTrigger_adaptive)*min)/200; /* per-cell threshold: weighted blend of the largest and smallest distance */
-+ return ((dA>dynthres)*0x00aa0055) | ((dB>dynthres)*0x005500aa) | ((dC>dynthres)*0x03000300) | ((dD>dynthres)*0x0c000c00);
-+#else
-+ return (Diff1(l2,l3+1)*0x00aa0055) | (Diff1(l2+1,l3)*0x005500aa) | (Diff1(l2+1,l3+1)*0x03000300) | (Diff1(l3,l3+1)*0x0c000c00);
-+#endif
-+}
-+#undef __MMX__
-+#else
-+#ifdef __MMX__
-+/* always on for gcc for now */
-+#define MMX_ONLY
-+/* this is safe for -march=..., but not if someone specifies -mmmx manually */
-+
-+# ifdef MMX_ONLY
-+# define Diff_mmx Diff
-+# else
-+ int has_mmx = 0;
-+# define Diff(a,b) (has_mmx?Diff_mmx((a),(b)):Diff_any((a),(b)))
-+# endif
-+#else
-+# define Diff_any Diff
-+#endif
-+
-+// A better colour distance function, adapted from http://www.compuphase.com/cmetric.htm
-+#if !defined(MMX_ONLY) || defined(DEBUG)
-+
-+inline static int Diff1(const unsigned char *e1, const unsigned char* e2)
-+{
-+ /* Returns 1 when the weighted colour distance between pixels e1 and e2 exceeds Hq2x_colourTrigger, else 0. */
-+ long r,g,b,rmean;
-+
-+ /* Bytes 0/1/2 are read as blue/green/red below; the red-level weight must use the same byte as r (was byte 0). */
-+ b = e1[0]-e2[0];
-+ g = e1[1]-e2[1];
-+ r = e1[2]-e2[2];
-+ rmean = e1[2]+e2[2];
-+ return ((128+rmean)*r*r + (192-rmean)*b*b)/256 + g*g > Hq2x_colourTrigger;
-+}
-+
-+inline static int Diff_any(const unsigned long *l2, const unsigned long *l3)
-+{
-+ return (Diff1((unsigned char *)l2,(unsigned char *)(l3+1))*0x00aa0055) | (Diff1((unsigned char *)(l2+1),(unsigned char *)l3)*0x005500aa) | (Diff1((unsigned char *)(l2+1),(unsigned char *)(l3+1))*0x03000300) | (Diff1((unsigned char *)l3,(unsigned char *)(l3+1))*0x0c000c00);
-+}
-+#endif
-+
-+#ifdef __MMX__
-+typedef int mmx_1_64 __attribute__((mode(DI)));
-+typedef int mmx_2_32 __attribute__((mode(V2SI)));
-+typedef int mmx_4_16 __attribute__((mode(V4HI)));
-+typedef int mmx_8_8 __attribute__((mode(V8QI)));
-+
-+static mmx_4_16 mmx_trigger;
-+
-+/* Note: this needs BGRA pixel layout, with the A component replaced by (-R)+32 */
-+inline static int Diff_mmx(const unsigned long *e1, const unsigned long *e2)
-+{
-+ mmx_4_16 mm0, mm1, mm2, mm3, mm4;
-+#ifdef DEBUG
-+ mmx_4_16 t0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16,t17,t18, m0, m1;
-+ #define d(x) x =
-+#else
-+#define d(x)
-+#endif
-+ const mmx_8_8 zero = (mmx_8_8)0x0ULL;
-+ const mmx_4_16 rmean_off = (mmx_4_16)(0x0a000a000a000a00ULL);
-+ const mmx_4_16 factors = (mmx_4_16)0xfffdfff4aa5655abULL;
-+
-+ /* -1 * aa56 = 0101 0101 1010 1010 */
-+ /* -1 * 55ab = 1010 1010 0101 0101 */
-+
-+ /*
-+ Read from memory:
-+ */
-+ mm1 = *(mmx_4_16 *)e2;
-+ mm0 = *(mmx_4_16 *)e1;
-+
-+ /* (high ................................ low)
-+ -p2r+32, p2b, p2g, p2r, -p1r+32, p1b, p1g, p1r = mm0
-+ -p5r+32, p5b, p5g, p5r, -p4r+32, p4b, p4g, p4r = mm1
-+
-+ Shuffle dwords so we get 4 registers with pixel
-+ arrangement ready for difference calculation:
-+
-+ (2, 5, 4, 1) - (5, 4, 2, 5)
-+
-+ We choose (rrrr, gggg) + (bbbb, rrrr) layout. This
-+ is quite expensive, given that difference calculation
-+ in (rgbr, rgbr) form would need just two unpacks, but
-+ the unpacking has to be done sooner or later, and
-+ this pixel layout makes later calculations cheaper.
-+ TODO: sse/mmxext version of this unpacking should be
-+ much cheaper.
-+
-+ -p4r+32, -p1r+32, p4b, p1b, p4g, p1g, p4r, p1r = mm0
-+ -p5r+32, p5b, p5g, p5r, -p5r+32, p5b, p5g, p5r = mm3 (temp)
-+ -p2r+32, -p5r+32, p2b, p5b, p2g, p5g, p2r, p5r = mm2
-+ -p5r+32, -p4r+32, p5b, p4b, p5g, p4g, p5r, p4r = mm1
-+*/
-+
-+ mm3 = (mmx_4_16)__builtin_ia32_punpckhdq((mmx_2_32)mm1,(mmx_2_32)mm1);
-+ mm2 = (mmx_4_16)__builtin_ia32_punpckhbw((mmx_8_8)mm1,(mmx_8_8)mm0);
-+ mm0 = (mmx_4_16)__builtin_ia32_punpcklbw((mmx_8_8)mm0,(mmx_8_8)mm1);
-+ mm1 = (mmx_4_16)__builtin_ia32_punpcklbw((mmx_8_8)mm1,(mmx_8_8)mm3);
-+
-+/*
-+ ... continued ...
-+
-+ -p2r+32, -p5r+32, -p4r+32, -p1r+32, p2b, p5b, p4b, p1b = mm0
-+ p2g, p5g, p4g, p1g, p2r, p5r, p4r, p1r = mm3
-+
-+ -p5r+32, -p4r+32, -p2r+32, -p5r+32, p5b, p4b, p2b, p5b = mm4
-+ p5g, p4g, p2g, p5g, p5r, p4r, p2r, p5r = mm2
-+
-+*/
-+
-+ mm3 = (mmx_4_16)__builtin_ia32_punpcklwd(mm0,mm2);
-+ mm0 = (mmx_4_16)__builtin_ia32_punpckhwd(mm0,mm2);
-+ mm4 = (mmx_4_16)__builtin_ia32_punpckhwd(mm2,mm1);
-+ mm2 = (mmx_4_16)__builtin_ia32_punpcklwd(mm2,mm1);
-+
-+ /*
-+ Put mm2 with negated red component into mm1. Negation is done
-+ in the lookup table.
-+
-+ -p2r+32, -p5r+32, -p4r+32, -p1r+32, p2b, p5b, p4b, p1b = mm0
-+ p2g, p5g, p4g, p1g, p2r, p5r, p4r, p1r = mm3
-+
-+ -p5r+32, -p4r+32, -p2r+32, -p5r+32, p5b, p4b, p2b, p5b = mm4
-+ p5g, p4g, p2g, p5g, -p5r+32, -p4r+32, -p2r+32, -p5r+32 = mm2
-+
-+ */
-+
-+ mm2 = (mmx_4_16)__builtin_ia32_punpckhdq((mmx_2_32)mm4,(mmx_2_32)mm2);
-+
-+ /*
-+
-+ Calculate the differences (and rmean)
-+ mm0-mm4, mm3-mm2 (signed saturation)
-+
-+ d1r/8, d3r/8, d8r/8, d0r/8, d1b/8, d3b/8, d8b/8, d0b/8 = mm0
-+ d1g/8, d3g/8, d8g/8, d0g/8, d1rmean/4-32, d3rmean/4-32, d8rmean/4-32, d0rmean/4-32 = mm3
-+
-+ */
-+
-+ mm0 = (mmx_4_16)__builtin_ia32_psubsb((mmx_8_8)mm0,(mmx_8_8)mm4);
-+ mm3 = (mmx_4_16)__builtin_ia32_psubsb((mmx_8_8)mm3,(mmx_8_8)mm2);
-+
-+#ifdef DEBUG
-+{
-+ int p1 = e1[0];
-+ int p5 = e2[1];
-+ char *cmm0 = (void*)&mm0, *cmm3 = (void*)&mm3;
-+ if (((int)cmm3[0]) != ((int)(p1&Rmask) + (int)(p5&Rmask) - 32)) abort();
-+ if (((int)cmm0[4]) != -((int)(p1&Rmask) - (int)(p5&Rmask))) abort();
-+}
-+#endif
-+
-+ /* Intermediate stats:
-+
-+ (rough) code equivalent:
-+ rmean = (((int)(e1&Rmask) + (int)(e2&Rmask)) >> 16) - 32;
-+ r = ((int)(e2&Rmask) - (int)(e1&Rmask)) >> 16;
-+ g = ((int)(e1&Gmask) - (int)(e2&Gmask)) >> 8;
-+ b = ((int)(e1&Bmask) - (int)(e2&Bmask));
-+
-+ Gain:
-+ 1 distance w/o mmx = 16 ops
-+ 4 distances w/ mmx = 11 ops
-+ (possible parallelism left to the compiler)
-+
-+ Todo:
-+ ((160+rmean)*r/8*r/8) + 256*g/8*g/8 + ((160-rmean)*b/8*b/8)
-+
-+ (slightly incorrect: the result is the true difference plus (b/8)^2, but
-+ this eliminates a constant, making the algorithm fit into the available
-+ 8 registers)
-+
-+ d1r/8, d3r/8, d8r/8, d0r/8, d1b/8, d3b/8, d8b/8, d0b/8 = mm0
-+ d1g/8, d3g/8, d8g/8, d0g/8, d1rmean/4-32, d3rmean/4-32, d8rmean/4-32, d0rmean/4-32 = mm3
-+ */
-+
-+ /*
-+ prepare differences for final calculation:
-+
-+ 00 d1r/2 00 d3r/2 00 d8r/2 00 d0r/2 = mm0
-+ 00 d1b/2 00 d3b/2 00 d8b/2 00 d0b/2 = mm1
-+ 00 d1g/8 00 d3g/8 00 d8g/8 00 d0g/8 = mm2
-+ 00 (d1rmean/4-32)*16 00 (d3rmean/4-32)*16 00 (d8rmean/4-32)*16 00 (d0rmean/4-32)*16 = mm3
-+
-+ */
-+
-+#ifdef DEBUG
-+ m0 = mm0;
-+ m1 = mm3;
-+#endif
-+ // TODO: compiler error at __builtin_ia32_psllb(mm0,2);
-+ d(t0) mm0 = __builtin_ia32_pmullw(mm0,(mmx_4_16)(0x0004000400040004ULL));
-+ d(t1) mm1 = (mmx_4_16)__builtin_ia32_punpcklbw(zero, (mmx_8_8)mm0);
-+ d(t2) mm0 = (mmx_4_16)__builtin_ia32_punpckhbw(zero, (mmx_8_8)mm0);
-+ d(t3) mm2 = (mmx_4_16)__builtin_ia32_punpckhbw(zero, (mmx_8_8)mm3);
-+ d(t4) mm3 = (mmx_4_16)__builtin_ia32_punpcklbw(zero, (mmx_8_8)mm3);
-+ // TODO: compiler error at __builtin_ia32_psraw(mm3,4);
-+ d(t5) mm3 = __builtin_ia32_pmulhw(mm3,(mmx_4_16)(0x1000100010001000ULL));
-+
-+ /*
-+ intermediate results: squares and rmean factors
-+
-+ 00 (d1r/2)^2 00 (d3r/2)^2 00 (d8r/2)^2 00 (d0r/2)^2 = mm0
-+ 00 (d1b/2)^2 00 (d3b/2)^2 00 (d8b/2)^2 00 (d0b/2)^2 = mm1
-+ 00 256*(d1g/8)^2 00 256*(d3g/8)^2 00 256*(d8g/8)^2 00 256*(d0g/8)^2 = mm2
-+ 00 128+d1rmean/4 00 128+d3rmean/4 00 128+d8rmean/4 00 128+d0rmean/4 = mm3
-+ 00 192-d1rmean/4 00 192-d3rmean/4 00 192-d8rmean/4 00 192-d0rmean/4 = mm4
-+
-+ */
-+
-+ d(t9) mm0 = __builtin_ia32_pmulhw(mm0,mm0);
-+ d(t10) mm1 = __builtin_ia32_pmulhw(mm1,mm1);
-+ d(t11) mm2 = __builtin_ia32_pmulhw(mm2,mm2);
-+ d(t12) mm4 = __builtin_ia32_psubsw(rmean_off,mm3);
-+ d(t13) mm3 = __builtin_ia32_paddsw(mm3,rmean_off);
-+
-+ /*
-+ intermediate results: finish red and blue components
-+
-+ 00 (128+d1rmean/4)*(d1r/8)^2 00 (128+d3rmean/4)*(d3r/8)^2 00 (128+d8rmean/4)*(d8r/8)^2 00 (128+d0rmean/4)*(d0r/8)^2 = mm0
-+ 00 (192-d1rmean/4)*(d1b/8)^2 00 (192-d3rmean/4)*(d3b/8)^2 00 (192-d8rmean/4)*(d8b/8)^2 00 (192-d0rmean/4)*(d0b/8)^2 = mm1
-+ 00 (d1g/8)^2 00 (d3g/8)^2 00 (d8g/8)^2 00 (d0g/8)^2 = mm2
-+
-+ */
-+
-+ d(t15) mm1 = __builtin_ia32_pmulhw(mm1, mm4);
-+ d(t16) mm0 = __builtin_ia32_pmulhw(mm0, mm3);
-+
-+ /*
-+ calculate final visual difference
-+
-+ (128+rmean/4)*(r/8)^2+(192-rmean/4)*(b/8)^2+256*(g/8)^2 = mm0 (order: 1 3 8 0)
-+ */
-+
-+ d(t17) mm0 = __builtin_ia32_paddw(mm0,mm1);
-+ d(t18) mm0 = __builtin_ia32_paddw(mm0,mm2);
-+
-+#ifdef DEBUG
-+{
-+ int p1 = e1[0];
-+ int p5 = e2[1];
-+ short *smm0 = (void*)&mm0;
-+ long r,g,b;
-+ long rmean, diff;
-+ rmean = (((p1+p5)&Rmask)-32);
-+ r = (p1&Rmask)-(p5&Rmask);
-+ g = ((p1&Gmask) - (p5&Gmask)) >> 8;
-+ b = ((p1&Bmask) - (p5&Bmask)) >> 16;
-+
-+ diff = ((160+rmean)*r*r + 256*g*g + (160-rmean)*b*b)/256;
-+ if (diff > smm0[0]+1 || diff < smm0[0]-1) abort();
-+}
-+#endif
-+
-+ /*
-+ Code equivalent:
-+ ((((512+rmean)>>8)*r*r) + 4*g*g + (((768-rmean)>>8)*b*b))
-+
-+ test against threshold
-+
-+ (diff1?0xffff:0x0000) (diff3?0xffff:0x0000) (diff8?0xffff:0x0000) (diff0?0xffff:0x0000) = mm0
-+ */
-+
-+ mm0 = __builtin_ia32_pcmpgtw(mm0,mmx_trigger);
-+
-+ /*
-+ create final bit patterns
-+
-+ 0000 0000 (diff1*0x03000300)|(diff3*0x0c000c00)|(diff8*0x005500aa)|(diff0*0x00aa0055)
-+
-+ */
-+
-+ mm0 = (mmx_4_16)__builtin_ia32_pmaddwd(mm0,factors);
-+ mm0 = (mmx_4_16)__builtin_ia32_punpcklbw((mmx_8_8)mm0,(mmx_8_8)__builtin_ia32_punpckhbw((mmx_8_8)mm0,(mmx_8_8)mm0));
-+
-+ return (unsigned long)(unsigned long long)mm0;
-+ /*
-+ Total: 11+16+3 = 30 ops for 4 distances vs. 16+13+7 = 36 ops for 1 distance
-+ */
-+}
-+#endif
-+#endif
-+
-+static int LUTPAL8to32[256] __attribute__((aligned(32)));
-+//#define factors(a,b,c,d) (((a)-1) | ((b)<<2) | ((c)<<4) | ((d)<<6))
-+#define P0 {8,0,0,0}
-+#define P10 {6,2,0,0}
-+#define P11 {6,0,0,2}
-+#define P12 {6,0,2,0}
-+#define P20 {4,0,2,2}
-+#define P21 {4,2,2,0}
-+#define P22 {4,2,0,2}
-+#define P60 {5,0,2,1}
-+#define P61 {5,0,1,2}
-+#define P70 {6,0,1,1}
-+#define P90 {2,0,3,3}
-+#define P100 {7,0,0,1}
-+#define X {0,0,0,0}
-+#define UNUSED X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, \
-+ X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X
-+
-+/* sparse table: only 2k entries are used */
-+static unsigned char factors[4096][4] __attribute__((aligned(32))) = {
-+/* 0000 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 0040 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 0080 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 00c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 0100 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 0140 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 0180 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 01c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 0200 */ UNUSED,
-+/* 0400 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0440 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0480 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 04c0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0500 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0540 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0580 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 05c0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0600 */ UNUSED,
-+/* 0800 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 0840 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 0880 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 08c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 0900 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 0940 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 0980 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 09c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 0a00 */ UNUSED,
-+/* 0c00 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0c40 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0c80 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0cc0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0d00 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0d40 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0d80 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0dc0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 0e00 */ UNUSED,
-+/* 1000 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 1040 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 1080 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 10c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 1100 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 1140 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 1180 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 11c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 1200 */ UNUSED,
-+/* 1400 */ P20, P20, P10, P0, P20, P20, P10, P0, P90, P90, P10, P0, P90, P90, P10, P0,
-+/* 1440 */ P20, P20, P10, P0, P20, P20, P10, P0, P90, P90, P10, P0, P90, P90, P10, P0,
-+/* 1480 */ P90, P90, P10, P0, P90, P90, P10, P0, P70, P100,P10, P0, P70, P100,P10, P0,
-+/* 14c0 */ P90, P90, P10, P0, P90, P90, P10, P0, P70, P100,P10, P0, P70, P100,P10, P0,
-+/* 1500 */ P20, P20, P10, P0, P20, P20, P10, P0, P90, P90, P10, P0, P90, P90, P10, P0,
-+/* 1540 */ P20, P20, P10, P0, P20, P20, P10, P0, P90, P90, P10, P0, P90, P90, P10, P0,
-+/* 1580 */ P90, P90, P10, P0, P90, P90, P10, P0, P70, P100,P10, P0, P70, P100,P10, P0,
-+/* 15c0 */ P90, P90, P10, P0, P90, P90, P10, P0, P70, P100,P10, P0, P70, P100,P10, P0,
-+/* 1600 */ UNUSED,
-+/* 1800 */ P21, P61, P21, P61, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
-+/* 1840 */ P21, P61, P21, P61, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
-+/* 1880 */ P21, P61, P21, P61, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
-+/* 18c0 */ P21, P61, P21, P61, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
-+/* 1900 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 1940 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 1980 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 19c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 1a00 */ UNUSED,
-+/* 1c00 */ P20, P20, P0, P0, P20, P20, P0, P0, P10, P20, P10, P0, P10, P20, P10, P0,
-+/* 1c40 */ P20, P20, P0, P0, P20, P20, P0, P0, P10, P20, P10, P0, P10, P20, P10, P0,
-+/* 1c80 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
-+/* 1cc0 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
-+/* 1d00 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0,
-+/* 1d40 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0,
-+/* 1d80 */ P70, P90, P10, P0, P70, P90, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
-+/* 1dc0 */ P70, P90, P10, P0, P70, P90, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
-+/* 1e00 */ UNUSED,
-+/* 2000 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 2040 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 2080 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 20c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 2100 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 2140 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 2180 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 21c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 2200 */ UNUSED,
-+/* 2400 */ P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, /* above */
-+/* 2440 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 2480 */ P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60,
-+/* 24c0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 2500 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 2540 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 2580 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 25c0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 2600 */ UNUSED,
-+/* 2800 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 2840 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 2880 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 28c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 2900 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 2940 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 2980 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 29c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
-+/* 2a00 */ UNUSED,
-+/* 2c00 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, /* rot */
-+/* 2c40 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 2c80 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P60, P22, P60, P22, P60, P22, P60,
-+/* 2cc0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 2d00 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 2d40 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 2d80 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 2dc0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
-+/* 2e00 */ UNUSED,
-+/* 3000 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 3040 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 3080 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 30c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 3100 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 3140 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 3180 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 31c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 3200 */ UNUSED,
-+/* 3400 */ P20, P20, P0, P0, P20, P20, P0, P0, P70, P20, P10, P0, P70, P20, P10, P0,
-+/* 3440 */ P20, P20, P0, P0, P20, P20, P0, P0, P70, P20, P10, P0, P70, P20, P10, P0,
-+/* 3480 */ P10, P20, P10, P0, P10, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
-+/* 34c0 */ P10, P20, P10, P0, P10, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
-+/* 3500 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P90, P10, P0, P70, P90, P10, P0,
-+/* 3540 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P90, P10, P0, P70, P90, P10, P0,
-+/* 3580 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
-+/* 35c0 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
-+/* 3600 */ UNUSED,
-+/* 3800 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 3840 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 3880 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
-+/* 38c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
-+/* 3900 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 3940 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 3980 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 39c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
-+/* 3a00 */ UNUSED,
-+/* 3c00 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0,
-+/* 3c40 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0,
-+/* 3c80 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
-+/* 3cc0 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
-+/* 3d00 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P20, P10, P0, P10, P20, P10, P0,
-+/* 3d40 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P20, P10, P0, P10, P20, P10, P0,
-+/* 3d80 */ P10, P20, P10, P0, P10, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
-+/* 3dc0 */ P10, P20, P10, P0, P10, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
-+/* 3e00 */ UNUSED
-+};
-+
-+/* Memory usage at 320 pixels width:
-+
-+ ~4k line buffer
-+ ~2.5k pattern buffer
-+ 8k factor table (can be packed into as little as 2k, but unpacking takes more time than we lose due
-+ to cache thrashing, at least on a pentium2)
-+ 1k palette table
-+------
-+~15.5k data
-+
-+If the diff table is used, another 8k (packed) / 64k (unpacked) are used.
-+*/
-+
-+/* Optimizations:
-+
-+Pixel/Pattern layout:
-+
-+1 2 3
-+ a c
-+4 5 6
-+ b d
-+7 8 9
-+
-+Factor storage:
-+Pixel: 5 1 2 4
-+
-+ 0: 8 0 0 0
-+ 10: 6 2 0 0
-+ 11: 6 0 0 2
-+ 12: 6 0 2 0
-+ 20: 4 0 2 2
-+ 21: 4 2 2 0
-+ 22: 4 2 0 2
-+ 60: 5 0 2 1
-+ 61: 5 0 1 2
-+ 70: 6 0 1 1
-+ 90: 2 0 3 3
-+100: 7 0 1 0
-+
-+bits: 3 1 2 2 = 8 bit
-+
-+Factor set 100 is wrong: 7 0 0.5 0.5 would be 100% like the original, but
-+since pixel 2 and 4 are visually close for all patterns where this set is
-+used, this important simplification should not be visible.
-+
-+Pattern usage:
-+
-+a b c d
-+0 8 1 3 10 5 6 (a) 9 2 (b) 4 7 (11) (c) (d) -> 1 2 4 5
-+
-+0 8 10 5 9 2 7 x 1 6 3 4
-+
-+cr+a2 dr+b2 ar br
-+2 9 1 4 11 7 6 (d) 8 0 (x) 3 5 (10) (y) (a) -> 3 2 6 5
-+
-+br+a3 ar dr+c3 cr
-+5 10 6 3 8 0 1 (m) 11 7 (c) 4 2 (9) (b) (n) -> 7 8 4 5
-+
-+d+b2+c3 c+a2 b+a3 a
-+7 11 6 4 9 2 1 (n) 10 5 (y) 3 0 (8) (x) (m) -> 9 8 6 5
-+
-+
-+Pattern storage:
-+
-+0123 = \/|_
-+
-+0101 010x 2323
-+1010 101x 2323
-+
-+*/
-+
-+union pattern { /* One pattern entry per buffer column; filled from Diff() by the diffcall macro. */
-+ unsigned short p[2]; /* the two 16-bit pattern words that the line template masks separately */
-+ unsigned long value; /* whole-entry view used for assignment; NOTE(review): overlays p[] exactly only where long is 32 bits */
-+};
-+
-+static unsigned long lines0[Hq2x_MAXWIDTH+2] __attribute__((aligned(32))); /* three buffers of LUT-converted pixels, one per source row in flight */
-+static unsigned long lines1[Hq2x_MAXWIDTH+2] __attribute__((aligned(32))); /* +2: the template indexes columns 0 .. Scaler_SrcWidth+1 */
-+static unsigned long lines2[Hq2x_MAXWIDTH+2] __attribute__((aligned(32)));
-+static unsigned long *l1, *l2, *l3, *tmp; /* row above / centre row / row below; rotated by the template on every call */
-+static union pattern p0[Hq2x_MAXWIDTH+2] __attribute__((aligned(32))); /* two pattern rows, swapped between top and bot on every call */
-+static union pattern p1[Hq2x_MAXWIDTH+2] __attribute__((aligned(32)));
-+static union pattern *top, *bot, *ptmp; /* bot is the row currently being filled by diffcall */
-+static unsigned char prev[Hq2x_MAXWIDTH+2]; /* copy of an earlier raw input line, compared against by the DIFF_TABLE variant of diffcall */
-+
-+#ifdef DIFF_TABLE /* diffcall is used as the body of a for loop over i in the line template */
-+#define diffcall bot[i].value = Diff(prev+i,pIn+i); memcpy(prev,pIn,Scaler_SrcWidth+1); /* two statements: only the first is the loop body, the memcpy runs once after the loop */
-+#else
-+#define diffcall bot[i].value = Diff(l2+i,l3+i); /* compares the converted centre row against the row below */
-+#endif
-+
-+#ifndef __MMX__
-+#define __builtin_ia32_emms() /* without MMX the emms call in the template expands to nothing */
-+#endif
-+
-+#define CONSTCHECK if (Scaler_SrcWidth == 320) RENDER_DrawLine = /* completed by the includer with the name of the 320-wide specialisation */
-+
-+#define store(out,index,x,y) do{((unsigned long*)out)[index*2] = 0xff000000|(x); ((unsigned long*)out)[index*2+1] = 0xff000000|(y);}while(0) /* 32-bit output: writes the two pixels of source column index, forcing the top byte to 0xff */
-+#define type long /* element type used by the template for its sizeof(type) pointer adjustment */
-+
-+#define FUNC Hq2x_long_320_line /* specialisation with the width fixed at compile time */
-+#define Scaler_SrcWidth 320
-+#define CHECK_CONST /* empty: the specialised variant never re-dispatches */
-+#include "render_hq2x_template.h"
-+#undef CHECK_CONST
-+#undef Scaler_SrcWidth
-+#undef FUNC
-+
-+#define FUNC Hq2x_long_Scaler_SrcWidth_line /* generic variant: Scaler_SrcWidth is no longer a macro here */
-+#define CHECK_CONST CONSTCHECK Hq2x_long_320_line; /* on its first line, switch RENDER_DrawLine to the specialisation when the width is 320 */
-+#include "render_hq2x_template.h"
-+#undef store
-+#undef type
-+#undef CHECK_CONST
-+#undef FUNC
-+
-+/* 16 bit support: both pixels of a source column are packed as 5-6-5 fields into one unsigned long store */
-+#ifdef WORDS_BIGENDIAN /* the two definitions differ only in parameter order, i.e. which pixel lands in the low half */
-+#define store(out,index,y,x) ((unsigned long *)out)[index] = (((((x)>>3)&0x1f)|(((y)<<13))&0x1f0000)|((((x)>>5)&0x7e0)|(((y)<<11))&0x7e00000)|((((x)>>8)&0xf800)|(((y)<<8))&0xf8000000))
-+#else
-+#define store(out,index,x,y) ((unsigned long *)out)[index] = (((((x)>>3)&0x1f)|(((y)<<13))&0x1f0000)|((((x)>>5)&0x7e0)|(((y)<<11))&0x7e00000)|((((x)>>8)&0xf800)|(((y)<<8))&0xf8000000))
-+#endif
-+#define type short /* element type used by the template for its sizeof(type) pointer adjustment */
-+
-+#define FUNC Hq2x_short_320_line /* specialisation with the width fixed at compile time */
-+#define Scaler_SrcWidth 320
-+#define CHECK_CONST /* empty: the specialised variant never re-dispatches */
-+#include "render_hq2x_template.h"
-+#undef CHECK_CONST
-+#undef Scaler_SrcWidth
-+#undef FUNC
-+
-+#define FUNC Hq2x_short_Scaler_SrcWidth_line /* generic variant: Scaler_SrcWidth is no longer a macro here */
-+#define CHECK_CONST CONSTCHECK Hq2x_short_320_line; /* on its first line, switch RENDER_DrawLine to the specialisation when the width is 320 */
-+#include "render_hq2x_template.h"
-+#undef store
-+#undef type
-+#undef CHECK_CONST
-+#undef FUNC
-+
-+ScalerBlock Hq2x_8={ /* Scaler descriptor for hq2x; the renderer selects it for OP_Hq2x. */
-+ CAN_16|CAN_32|LOVE_32|NEED_RGB, /* flags handed to GFX_GetBestMode by the renderer */
-+ 2,2,1, /* NOTE(review): presumably x scale, y scale and one further field - confirm against struct ScalerBlock */
-+ 0,Hq2x_short_Scaler_SrcWidth_line,Hq2x_short_Scaler_SrcWidth_line,Hq2x_long_Scaler_SrcWidth_line /* line handlers: none, the 16-bit packer twice, the 32-bit writer last; presumably one slot per output depth - confirm */
-+};
-+
-+void Hq2x_InitLUTs(const void *pal, int palette_end, int palette_start) // Rebuilds the hq2x lookup data; pal is read as an array of struct GFX_PalEntry.
-+{
-+ int i, j; // j is only used by the DIFF_TABLE loop below
-+ struct GFX_PalEntry *palette = (struct GFX_PalEntry *)pal;
-+
-+ // All components are reduced to 5 bit (VGA palette has 6 bit)
-+ // for simpler multiplication and storage (divided by 8)
-+ for (i=palette_start; i<=palette_end; i++) { // inclusive range; nothing is read from pal when palette_start > palette_end
-+ // 5 significant bits with 3 bit multiplier fit into 8 bit, thus
-+ // plain int multiplication can be used without tricks
-+ // R is duplicated into A, negated and increased by 32 for some
-+ // nice mmx distance calculation tricks
-+ LUTPAL8to32[i] = ((palette[i].r&0xf8) << 13) | ((palette[i].g&0xf8) << 5) | ((palette[i].b&0xf8) >> 3) | ((32*8-(palette[i].r&0xf8)) << 21); // layout: blue in bits 0-4, green in 8-12, red in 16-20, (32 - 5-bit red) from bit 24 up
-+ }
-+
-+#ifdef DIFF_TABLE
-+#if DIFF_TABLE != 1
-+ memset(difftable,0,sizeof(difftable)); // the packed table is built with |= below, so it has to start out cleared
-+#endif
-+ for (i = 0; i < 256; i++) { // every ordered pair (i, j) of the 256 LUT entries, including ones not refreshed above
-+ for (j = 0; j < 256; j++) {
-+ difftable[(i) * (256/bits) + ((j) / bits)]
-+#if DIFF_TABLE == 1
-+ =
-+#else
-+ |=
-+#endif
-+ Diff1_calc((LUTPAL8to32[i]>>16)&0x1f,(LUTPAL8to32[i]>>8)&0x1f,(LUTPAL8to32[i])&0x1f, (LUTPAL8to32[j]>>16)&0x1f,(LUTPAL8to32[j]>>8)&0x1f,(LUTPAL8to32[j])&0x1f) << (j%bits); // arguments are the 5-bit r, g, b of entry i followed by those of entry j
-+ }
-+ }
-+#endif
-+
-+#ifdef __MMX__
-+ *((short *)(&mmx_trigger)) = Hq2x_colourTrigger; // copy the current threshold into each of the four 16-bit words of mmx_trigger
-+ *(((short *)(&mmx_trigger))+1) = Hq2x_colourTrigger;
-+ *(((short *)(&mmx_trigger))+2) = Hq2x_colourTrigger;
-+ *(((short *)(&mmx_trigger))+3) = Hq2x_colourTrigger;
-+#endif
-+}
-+
-+void Hq2x_IncreaseThreshold(void) /* Raise the static colour threshold by one (capped at 255), refresh the derived data and log the new value. */
-+{
-+ if (Hq2x_colourTrigger < 255) Hq2x_colourTrigger++;
-+ Hq2x_InitLUTs(0,0,1); /* start 1 > end 0: no palette entry is read, only the threshold-dependent data is rebuilt */
-+ LOG_MSG("Hq2x threshold at %i",(int)Hq2x_colourTrigger); /* the variable is long; cast so the argument matches %i */
-+}
-+
-+void Hq2x_DecreaseThreshold(void) /* Lower the static colour threshold by one (not below 0), refresh the derived data and log the new value. */
-+{
-+ if (Hq2x_colourTrigger > 0) Hq2x_colourTrigger--;
-+ Hq2x_InitLUTs(0,0,1); /* start 1 > end 0: no palette entry is read, only the threshold-dependent data is rebuilt */
-+ LOG_MSG("Hq2x threshold at %i",(int)Hq2x_colourTrigger); /* the variable is long; cast so the argument matches %i */
-+}
-+
-+void Hq2x_IncreaseThresholdAdaptive(void) /* Raise the adaptive threshold by one (capped at 100) and log the new value. */
-+{
-+ if (Hq2x_colourTrigger_adaptive < 100) Hq2x_colourTrigger_adaptive++;
-+ LOG_MSG("Hq2x adaptive threshold at %i",(int)Hq2x_colourTrigger_adaptive); /* the variable is long; cast so the argument matches %i */
-+}
-+
-+void Hq2x_DecreaseThresholdAdaptive(void) /* Lower the adaptive threshold by one (not below 0) and log the new value. */
-+{
-+ if (Hq2x_colourTrigger_adaptive > 0) Hq2x_colourTrigger_adaptive--;
-+ LOG_MSG("Hq2x adaptive threshold at %i",(int)Hq2x_colourTrigger_adaptive); /* the variable is long; cast so the argument matches %i */
-+}
-+
-diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render_hq2x.h dosbox-0.61+hq2x/src/gui/render_hq2x.h
---- dosbox-0.61/src/gui/render_hq2x.h 1970-01-01 01:00:00.000000000 +0100
-+++ dosbox-0.61+hq2x/src/gui/render_hq2x.h 2004-08-02 20:34:46.000000000 +0200
-@@ -0,0 +1,31 @@
-+//derived from the hq2x filter demo program
-+//----------------------------------------------------------
-+//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
-+// Speed optimization and mmx code Copyright (c) 2004 Jörg Walter (jwalt@garni.ch)
-+
-+//This program is free software; you can redistribute it and/or
-+//modify it under the terms of the GNU Lesser General Public
-+//License as published by the Free Software Foundation; either
-+//version 2.1 of the License, or (at your option) any later version.
-+//
-+//This program is distributed in the hope that it will be useful,
-+//but WITHOUT ANY WARRANTY; without even the implied warranty of
-+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+//Lesser General Public License for more details.
-+//
-+//You should have received a copy of the GNU Lesser General Public
-+//License along with this program; if not, write to the Free Software
-+//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-+
-+#ifndef __HQ2X_H
-+#define __HQ2X_H
-+
-+#define Hq2x_MAXWIDTH (640-2) /* parenthesised so the value survives any surrounding operator; the scaler buffers are sized Hq2x_MAXWIDTH+2 */
-+extern long Hq2x_colourTrigger; /* static threshold, stepped within 0..255 by the functions below */
-+extern long Hq2x_colourTrigger_adaptive; /* adaptive threshold, stepped within 0..100 by the functions below */
-+extern void Hq2x_InitLUTs(const void *palette, int palette_end, int palette_start); /* converts entries palette_start..palette_end (inclusive); note that end comes before start */
-+extern void Hq2x_IncreaseThreshold(void);
-+extern void Hq2x_DecreaseThreshold(void);
-+extern void Hq2x_IncreaseThresholdAdaptive(void);
-+extern void Hq2x_DecreaseThresholdAdaptive(void);
-+#endif
-diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render_hq2x_template.h dosbox-0.61+hq2x/src/gui/render_hq2x_template.h
---- dosbox-0.61/src/gui/render_hq2x_template.h 1970-01-01 01:00:00.000000000 +0100
-+++ dosbox-0.61+hq2x/src/gui/render_hq2x_template.h 2004-08-02 17:27:01.000000000 +0200
-@@ -0,0 +1,78 @@
-+static void FUNC(const unsigned char *pIn) /* Line handler template; FUNC, type, store and CHECK_CONST (optionally Scaler_SrcWidth) are macros set by the including file. */
-+{
-+ int i, j;
-+ unsigned int factor, value1, value2, linesa = (*Scaler_Index++)+1, linesb = linesa/2; /* linesa+linesb = *Scaler_Index + 1 output rows for this call */
-+ linesa -= linesb; /* upper half gets the larger share when the total is odd */
-+
-+ pIn--; /* shift to 1-based columns: pIn[1] is the first input pixel, matching buffer column 1 */
-+ if (__builtin_expect(Scaler_Line++==0,0)) { /* first call (Scaler_Line was 0): prime the buffers only, nothing is written to the output */
-+ int i; /* shadows the outer i */
-+ CHECK_CONST
-+
-+ Scaler_DstWrite -= 2*sizeof(type); /* store() is called with 1-based columns, so back up by the two output pixels of one column */
-+ l1 = lines0;
-+ l2 = lines1;
-+ l3 = lines2;
-+ for (i=0; i <= Scaler_SrcWidth+1; i++) l2[i] = 0x20000000; /* border row above the image; 0x20000000 is what Hq2x_InitLUTs yields for r=g=b=0 */
-+ l3[0] = 0x20000000; /* left border column */
-+ l3[1] = LUTPAL8to32[pIn[1]];
-+
-+ for (i=2; i<=Scaler_SrcWidth+1; i++) /* NOTE(review): the last iteration reads one byte past the Scaler_SrcWidth input pixels - confirm the source line is padded */
-+ l3[i] = LUTPAL8to32[pIn[i]];
-+
-+ top = p0;
-+ bot = p1;
-+ memcpy(prev,pIn,Scaler_SrcWidth+1);
-+ for (i=1; i <= Scaler_SrcWidth; i++) diffcall /* fill bot[] for the primed row */
-+ return;
-+ }
-+
-+ tmp = l1; l1 = l2; l2 = l3; l3 = tmp; /* rotate rows: the previous "below" row becomes the centre row */
-+ ptmp = top; top = bot; bot = ptmp; /* swap pattern rows the same way */
-+ bot[0].value = 0x07ff07ff; /* pattern entry for the left border column */
-+
-+ l3[0] = 0x20000000; /* left border column */
-+ l3[1] = LUTPAL8to32[pIn[1]];
-+
-+ for (i=2; i<=Scaler_SrcWidth+1; i++) /* convert the new input line into the "below" row */
-+ l3[i] = LUTPAL8to32[pIn[i]];
-+
-+ for (i=1; i<=Scaler_SrcWidth; i++) diffcall /* patterns for the new row */
-+
-+ if (linesa > 0) { /* upper output row: centre row l2 blended with the row above, l1 */
-+ for (i=1; i<=Scaler_SrcWidth; i++) {
-+ factor = (top[i-1].p[0]&0x503)|(bot[i-1].p[0]&0x20c)|(top[i].p[0]&0x830)|(bot[i].p[0]&0x040); /* assemble the factors[] index from four neighbouring pattern entries */
-+ value1 = (l1[i-1]*factors[factor][1]+l1[i]*factors[factor][2]+l2[i-1]*factors[factor][3]+l2[i]*factors[factor][0]); /* left pixel: weights [0] centre, [1] diagonal, [2] vertical, [3] horizontal neighbour */
-+
-+ factor = (top[i-1].p[1]&0x930)|(bot[i-1].p[1]&0x240)|(top[i].p[1]&0x403)|(bot[i].p[1]&0x00c);
-+ value2 = (l1[i+1]*factors[factor][1]+l1[i]*factors[factor][2]+l2[i+1]*factors[factor][3]+l2[i]*factors[factor][0]); /* right pixel: same weights, mirrored to column i+1 */
-+ store(Scaler_DstWrite,i,value1,value2);
-+ }
-+ while (--linesa) { /* repeat the finished row for each additional requested output line */
-+ memcpy(Scaler_DstWrite+Scaler_DstPitch,Scaler_DstWrite,Scaler_DstPitch);
-+ Scaler_DstWrite += Scaler_DstPitch;
-+ }
-+ Scaler_DstWrite += Scaler_DstPitch;
-+ }
-+
-+ if (linesb > 0) { /* lower output row: centre row l2 blended with the row below, l3 */
-+ for (i=1; i <= Scaler_SrcWidth; i++) {
-+ factor = (top[i-1].p[1]&0x60c)|(bot[i-1].p[1]&0x103)|(top[i].p[1]&0x840)|(bot[i].p[1]&0x030);
-+ value1 = (l3[i-1]*factors[factor][1]+l3[i]*factors[factor][2]+l2[i-1]*factors[factor][3]+l2[i]*factors[factor][0]);
-+
-+ factor = (top[i-1].p[0]&0xa40)|(bot[i-1].p[0]&0x130)|(top[i].p[0]&0x40c)|(bot[i].p[0]&0x003);
-+ value2 = (l3[i+1]*factors[factor][1]+l3[i]*factors[factor][2]+l2[i+1]*factors[factor][3]+l2[i]*factors[factor][0]);
-+ store(Scaler_DstWrite,i,value1,value2);
-+ }
-+ while (--linesb) { /* repeat the finished row for each additional requested output line */
-+ memcpy(Scaler_DstWrite+Scaler_DstPitch,Scaler_DstWrite,Scaler_DstPitch);
-+ Scaler_DstWrite += Scaler_DstPitch;
-+ }
-+ Scaler_DstWrite += Scaler_DstPitch;
-+ }
-+
-+ if (__builtin_expect(Scaler_Line==Scaler_SrcHeight,0)) { /* output lags input by one row, so the last input line needs one extra pass */
-+ FUNC(pIn+1); /* +1 undoes the pIn-- above, re-feeding the same line; NOTE(review): this also consumes one more Scaler_Index entry - confirm one exists */
-+ __builtin_ia32_emms();
-+ }
-+}
-diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render_scalers.h dosbox-0.61+hq2x/src/gui/render_scalers.h
---- dosbox-0.61/src/gui/render_scalers.h 2004-06-10 09:18:19.000000000 +0200
-+++ dosbox-0.61+hq2x/src/gui/render_scalers.h 2004-07-04 23:29:49.000000000 +0200
-@@ -30,6 +30,7 @@
- OP_AdvInterp2x,
- OP_Interp2x,
- OP_TV2x,
-+ OP_Hq2x,
- };
-
- struct ScalerBlock {
-@@ -46,6 +47,7 @@
- extern ScalerBlock AdvInterp2x_8;
- extern ScalerBlock Interp2x_8;
- extern ScalerBlock TV2x_8;
-+extern ScalerBlock Hq2x_8;
-
-
- #endif
-diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/hardware/ymf262.c dosbox-0.61+hq2x/src/hardware/ymf262.c
---- dosbox-0.61/src/hardware/ymf262.c 2004-03-28 15:04:45.000000000 +0200
-+++ dosbox-0.61+hq2x/src/hardware/ymf262.c 2004-06-20 03:54:47.000000000 +0200
-@@ -844,23 +844,52 @@
- INLINE signed int op_calc(UINT32 phase, unsigned int env, signed int pm, unsigned int wave_tab)
- {
- UINT32 p;
-+ int pos = (((signed int)((phase & ~FREQ_MASK) + (pm<<16))) >> FREQ_SH ); /* table position before masking; the modulation pm is applied shifted left by 16 */
-+#ifdef SMALL_CACHE /* remap the wave_tab 1..5 tables onto table 0 by adjusting pos; tables 6 and 7 are still read directly */
-+ if ((wave_tab == 1*SIN_LEN) && (pos & (SIN_LEN>>1))) pos = 0;
-+ if ((wave_tab == 3*SIN_LEN) && (pos & (SIN_LEN>>2))) pos = 0;
-+ if (wave_tab == 2*SIN_LEN || wave_tab == 3*SIN_LEN) pos &= SIN_MASK>>1;
-+ if (wave_tab == 4*SIN_LEN || wave_tab == 5*SIN_LEN) {
-+ if (wave_tab == 5*SIN_LEN) pos &= SIN_MASK>>1;
-+ pos *= 2;
-+ if (pos & (SIN_LEN>>1)) pos = 0;
-+ }
-+ if (wave_tab != 6*SIN_LEN && wave_tab != 7*SIN_LEN) wave_tab = 0;
-+#endif
-+ p = (env<<4) + sin_tab[wave_tab + (pos & SIN_MASK)];
-
-- p = (env<<4) + sin_tab[wave_tab + ((((signed int)((phase & ~FREQ_MASK) + (pm<<16))) >> FREQ_SH ) & SIN_MASK) ];
--
-- if (p >= TL_TAB_LEN)
-- return 0;
-+#if 1 /* reads only the first TL_TAB_LEN/13 entries and shifts instead; NOTE(review): assumes TL_TAB_LEN/13 is a power of two and the shift count stays below the int width - confirm */
-+ return tl_tab[p&(TL_TAB_LEN/13-1)] >> (p/(TL_TAB_LEN/13));
-+#else
-+ if (p >= TL_TAB_LEN) return 0; /* same bound as the replaced code: p == TL_TAB_LEN must not index tl_tab */
- return tl_tab[p];
-+#endif
- }
-
- INLINE signed int op_calc1(UINT32 phase, unsigned int env, signed int pm, unsigned int wave_tab)
- {
- UINT32 p;
-+ int pos = (((signed int)((phase & ~FREQ_MASK) + pm)) >> FREQ_SH ); /* table position before masking; unlike op_calc, pm is added unshifted */
-+#ifdef SMALL_CACHE /* remap the wave_tab 1..5 tables onto table 0 by adjusting pos; tables 6 and 7 are still read directly */
-+ if ((wave_tab == 1*SIN_LEN) && (pos & (SIN_LEN>>1))) pos = 0;
-+ if ((wave_tab == 3*SIN_LEN) && (pos & (SIN_LEN>>2))) pos = 0;
-+ if (wave_tab == 2*SIN_LEN || wave_tab == 3*SIN_LEN) pos &= SIN_MASK>>1;
-+ if (wave_tab == 4*SIN_LEN || wave_tab == 5*SIN_LEN) {
-+ if (wave_tab == 5*SIN_LEN) pos &= SIN_MASK>>1;
-+ pos *= 2;
-+ if (pos & (SIN_LEN>>1)) pos = 0;
-+ }
-+ if (wave_tab != 6*SIN_LEN && wave_tab != 7*SIN_LEN) wave_tab = 0;
-+#endif
-
-- p = (env<<4) + sin_tab[wave_tab + ((((signed int)((phase & ~FREQ_MASK) + pm))>>FREQ_SH) & SIN_MASK)];
-+ p = (env<<4) + sin_tab[wave_tab + (pos & SIN_MASK)];
-
-- if (p >= TL_TAB_LEN)
-- return 0;
-+#if 1 /* reads only the first TL_TAB_LEN/13 entries and shifts instead; NOTE(review): assumes TL_TAB_LEN/13 is a power of two and the shift count stays below the int width - confirm */
-+ return tl_tab[p&(TL_TAB_LEN/13-1)] >> (p/(TL_TAB_LEN/13));
-+#else
-+ if (p >= TL_TAB_LEN) return 0; /* same bound as the replaced code: p == TL_TAB_LEN must not index tl_tab */
- return tl_tab[p];
-+#endif
- }
-
-
-diff -ruN src./dosbox.cpp src/dosbox.cpp
---- dupa/src./dosbox.cpp 2004-09-30 15:15:59.000000000 +0200
-+++ dupa/src/dosbox.cpp 2004-09-30 15:18:48.301932384 +0200
-@@ -231,11 +231,17 @@
- secprop->Add_int("frameskip",0);
- secprop->Add_bool("aspect",false);
- secprop->Add_string("scaler","normal2x");
-+ secprop->Add_int("hq2x_threshold_adaptive",75);
-+ secprop->Add_int("hq2x_threshold",0);
- MSG_Add("RENDER_CONFIGFILE_HELP",
- "frameskip -- How many frames dosbox skips before drawing one.\n"
- "aspect -- Do aspect correction.\n"
- "scaler -- Scaler used to enlarge/enhance low resolution modes.\n"
-- " Supported are none,normal2x,advmame2x,advmame3x,advinterp2x,interp2x,tv2x.\n"
-+ " Supported are none,normal2x,advmame2x,advmame3x,advinterp2x,interp2x,tv2x,hq2x.\n"
-+ "hq2x_threshold_adaptive -- The adaptive threshold used to detect edges in hq2x\n"
-+ " Possible values are 0-100, can be modified with Ctrl+Alt+F5/F6\n"
-+ "hq2x_threshold -- The static threshold used to detect edges in hq2x\n"
-+ " Possible values are 0-255, can be modified with Ctrl+Alt+F3/F4\n"
- );
-
- secprop=control->AddSection_prop("cpu",&CPU_Init);