--- /dev/null
+diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/Makefile.am dosbox-0.61+hq2x/src/gui/Makefile.am
+--- dosbox-0.61/src/gui/Makefile.am 2004-07-05 02:44:22.000000000 +0200
++++ dosbox-0.61+hq2x/src/gui/Makefile.am 2004-07-04 23:25:07.000000000 +0200
+@@ -3,5 +3,6 @@
+ noinst_LIBRARIES = libgui.a
+ libgui_a_SOURCES = sdlmain.cpp sdl_mapper.cpp \
+ render.cpp render_scalers.cpp render_scalers.h render_templates.h \
+- midi.cpp midi_win32.h midi_oss.h midi_coreaudio.h midi_alsa.h
++ midi.cpp midi_win32.h midi_oss.h midi_coreaudio.h midi_alsa.h \
++ render_hq2x.cpp render_hq2x.h
+
+diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render.cpp dosbox-0.61+hq2x/src/gui/render.cpp
+--- dosbox-0.61/src/gui/render.cpp 2004-08-05 00:12:58.732847304 +0200
++++ dosbox-0.61+hq2x/src/gui/render.cpp 2004-08-04 23:50:12.000000000 +0200
+@@ -33,6 +33,7 @@
+ #include "support.h"
+
+ #include "render_scalers.h"
++#include "render_hq2x.h"
+
+ struct PalData {
+ struct {
+@@ -190,6 +191,9 @@
+ }
+ break;
+ }
++ if (render.op.type == OP_Hq2x) {
++ Hq2x_InitLUTs((void*)render.pal.rgb,render.pal.last,render.pal.first);
++ }
+ /* Setup pal index to startup values */
+ render.pal.first=256;
+ render.pal.last=0;
+@@ -314,6 +318,7 @@
+ case OP_Interp2x:block=&Interp2x_8;break;
+ case OP_AdvInterp2x:block=&AdvInterp2x_8;break;
+ case OP_TV2x:block=&TV2x_8;break;
++ case OP_Hq2x:block=&Hq2x_8;break;
+ }
+ gfx_flags=GFX_GetBestMode(block->flags);
+ if (!gfx_flags) {
+@@ -362,7 +367,7 @@
+
+ extern void GFX_SetTitle(Bits cycles, Bits frameskip,bool paused);
+ static void IncreaseFrameSkip(void) {
+- if (render.frameskip.max<10) render.frameskip.max++;
++ if (render.frameskip.max<25) render.frameskip.max++;
+ LOG_MSG("Frame Skip at %d",render.frameskip.max);
+ GFX_SetTitle(-1,render.frameskip.max,false);
+ }
+@@ -376,6 +381,12 @@
+ void RENDER_Init(Section * sec) {
+ Section_prop * section=static_cast<Section_prop *>(sec);
+
++ Hq2x_colourTrigger=section->Get_int("hq2x_threshold");
++ if (Hq2x_colourTrigger > 255) Hq2x_colourTrigger = 255;
++ if (Hq2x_colourTrigger < 0) Hq2x_colourTrigger = 0;
++ Hq2x_colourTrigger_adaptive=section->Get_int("hq2x_threshold_adaptive");
++ if (Hq2x_colourTrigger_adaptive > 255) Hq2x_colourTrigger_adaptive = 255;
++ if (Hq2x_colourTrigger_adaptive <= 0) Hq2x_colourTrigger_adaptive = 75;
+ render.pal.first=256;
+ render.pal.last=0;
+ render.aspect=section->Get_bool("aspect");
+@@ -398,12 +409,17 @@
+ else if (!strcasecmp(scaler,"advinterp2x")) render.op.want_type=OP_AdvInterp2x;
+ else if (!strcasecmp(scaler,"interp2x")) render.op.want_type=OP_Interp2x;
+ else if (!strcasecmp(scaler,"tv2x")) render.op.want_type=OP_TV2x;
++ else if (!strcasecmp(scaler,"hq2x")) render.op.want_type=OP_Hq2x;
+ else {
+ render.op.want_type=OP_Normal;
+ LOG_MSG("Illegal scaler type %s,falling back to normal.",scaler);
+ }
+ MAPPER_AddHandler(DecreaseFrameSkip,MK_f7,MMOD1,"decfskip","Dec Fskip");
+ MAPPER_AddHandler(IncreaseFrameSkip,MK_f8,MMOD1,"incfskip","Inc Fskip");
++ MAPPER_AddHandler(Hq2x_DecreaseThreshold,MK_f3,MMOD1|MMOD2,"dechq2xthreshold","Dec Hq2x Static Threshold");
++ MAPPER_AddHandler(Hq2x_IncreaseThreshold,MK_f4,MMOD1|MMOD2,"inchq2xthreshold","Inc Hq2x Static Threshold");
++ MAPPER_AddHandler(Hq2x_DecreaseThresholdAdaptive,MK_f5,MMOD1|MMOD2,"dechq2xadapthreshold","Dec Hq2x Adaptive Threshold");
++ MAPPER_AddHandler(Hq2x_IncreaseThresholdAdaptive,MK_f6,MMOD1|MMOD2,"inchq2xadapthreshold","Inc Hq2x Adaptive Threshold");
+ GFX_SetTitle(-1,render.frameskip.max,false);
+ }
+
+diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render_hq2x.cpp dosbox-0.61+hq2x/src/gui/render_hq2x.cpp
+--- dosbox-0.61/src/gui/render_hq2x.cpp 1970-01-01 01:00:00.000000000 +0100
++++ dosbox-0.61+hq2x/src/gui/render_hq2x.cpp 2004-08-04 23:43:53.000000000 +0200
+@@ -0,0 +1,799 @@
++//hq2x filter demo program
++//----------------------------------------------------------
++//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
++// Speed optimization and mmx code Copyright (c) 2004 Jörg Walter (jwalt@garni.ch)
++
++//This program is free software; you can redistribute it and/or
++//modify it under the terms of the GNU Lesser General Public
++//License as published by the Free Software Foundation; either
++//version 2.1 of the License, or (at your option) any later version.
++//
++//This program is distributed in the hope that it will be useful,
++//but WITHOUT ANY WARRANTY; without even the implied warranty of
++//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++//Lesser General Public License for more details.
++//
++//You should have received a copy of the GNU Lesser General Public
++//License along with this program; if not, write to the Free Software
++//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++
++/*
++ This code comes in three variants:
++ 1. plain C code with live difference calculation
++ 2. C code with live difference calculation in MMX
++ 3. lookup-table based difference calculation
++
++ Which one is fastest depends on your CPU speed and cache size. The table-based algorithm
++ should be fastest if you have 32kb of L1 data cache or more. Packing diff values into fewer
++ bytes is possible: define DIFF_TABLE to the number of bits per int.
++
++ Speed: 22fps/27fps(MMX)/32fps(table) on a pentium2/333MHz
++
++ TODO:
++ Currently only does 32bpp/16bpp BGRA output, and (theoretically) RGB output. YUV
++ isn't needed anymore, it seems, so this code should now work in all setups.
++ MMX code only does RGB, thus isn't really usable (but useful for benchmarking).
++ This code should use the intel compiler functions for mmx, as GCC emulates
++ them more or less completely.
++
++ further optimization ideas:
++ - fix gcc bugs (shift), so Diff_mmx can run without register spilling
++ - manual unrolling of Diff loop to get decent memory prefetch for
++ recent CPUs
++ - add mmxext support to Diff for faster unpacking
++ - test if sse's movntq in interpolation loop improves things
++ - find a way to mmxify the interpolation loop sensibly
++ (currently runs slower than non-mmx code)
++ - find a way to save (cache-)memory in the factors table
++ (tighter packing and double indirection are both slower on p2)
++ - find a way for 16bpp not to suck that hard (speed-wise)
++*/
++
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <png.h>
++#include <math.h>
++
++#include "config.h"
++#include "dosbox.h"
++#include "video.h"
++#include "render_scalers.h"
++#include "render_hq2x.h"
++
++#ifndef __GNUC__ /* GCC-compatible compilers predefine __GNUC__ (there is no "__GCC__"); only stub these out for other compilers */
++#define __attribute__(x)
++#define __builtin_expect(x,y) x
++#endif /* !__GNUC__ */
++
++#define ADAPTIVE
++/* #define DEBUG */
++// #define DIFF_TABLE 32
++#define DIFF_TABLE 1
++
++// Gathered experimentally, values from 0x08-0x80 are useful, depending on graphics
++// and your personal preference.
++long Hq2x_colourTrigger = 0;
++long Hq2x_colourTrigger_adaptive = 75;
++
++#ifdef ADAPTIVE
++#undef DIFF_TABLE
++#define DIFF_TABLE 1
++#endif
++
++#ifdef DIFF_TABLE
++#define bits DIFF_TABLE
++static
++#if DIFF_TABLE == 1
++unsigned char
++#else
++int
++#endif
++difftable[65536/bits];
++
++inline static unsigned int Diff1_calc(int r1, int g1, int b1, int r2, int g2, int b2) /* Weighted colour distance between (r1,g1,b1) and (r2,g2,b2), measured against Hq2x_colourTrigger. */
++{
++ long r,g,b;
++ long rmean;
++ long ret;
++
++ rmean = r1+r2; /* sum (not average) of the two red components; it shifts weight between the red and blue terms below */
++ b = b1-b2;
++ g = g1-g2;
++ r = r1-r2;
++
++#ifdef ADAPTIVE
++ ret = (unsigned int)(((128+rmean)*r*r + (192-rmean)*b*b)/256 + g*g); /* weighted squared distance */
++ return (ret < Hq2x_colourTrigger?0:ret-Hq2x_colourTrigger > 255?255:ret-Hq2x_colourTrigger); /* distance minus the static threshold, clamped to 0..255; presumably stored in the unsigned char difftable - caller not visible here */
++#else
++ return ((128+rmean)*r*r + (192-rmean)*b*b)/256 + g*g > Hq2x_colourTrigger; /* non-adaptive: 1 if the distance exceeds the static threshold, else 0 */
++#endif
++}
++
++#if DIFF_TABLE == 1
++#define Diff1(x,y) (difftable[(*(x)) * (256/bits) + ((*(y)) / bits)] >> ((*(y))%bits))
++#else
++#define Diff1(x,y) ((difftable[(*(x)) * (256/bits) + ((*(y)) / bits)] >> ((*(y))%bits)) & 1)
++#endif
++inline static int Diff(const unsigned char *l2, const unsigned char *l3) /* Builds the pattern bits for one position from four pairwise Diff1() table lookups on the byte rows l2 and l3 (reads l2[0..1] and l3[0..1]). */
++{
++#ifdef ADAPTIVE
++ unsigned int max = Diff1(l2,l3+1); /* pair (l2[0], l3[1]) */
++ unsigned int min = max;
++ unsigned int dynthres = Diff1(l2+1,l3); /* pair (l2[1], l3[0]); dynthres doubles as a scratch value until the threshold is computed */
++ if (dynthres > max) max = dynthres;
++ if (dynthres < min) min = dynthres;
++ dynthres = Diff1(l2+1,l3+1); /* pair (l2[1], l3[1]) */
++ if (dynthres > max) max = dynthres;
++ if (dynthres < min) min = dynthres;
++ dynthres = Diff1(l3,l3+1); /* pair (l3[0], l3[1]) */
++ if (dynthres > max) max = dynthres;
++ if (dynthres < min) min = dynthres;
++ dynthres = (Hq2x_colourTrigger_adaptive*max+(100-Hq2x_colourTrigger_adaptive)*min)/200; /* local threshold: mix of the largest and smallest of the four distances, weighted by Hq2x_colourTrigger_adaptive */
++
++ return ((Diff1(l2,l3+1)>dynthres)*0x00aa0055) | ((Diff1(l2+1,l3)>dynthres)*0x005500aa) | ((Diff1(l2+1,l3+1)>dynthres)*0x03000300) | ((Diff1(l3,l3+1)>dynthres)*0x0c000c00); /* every pair above the local threshold ORs in its fixed bit mask */
++#else
++ return (Diff1(l2,l3+1)*0x00aa0055) | (Diff1(l2+1,l3)*0x005500aa) | (Diff1(l2+1,l3+1)*0x03000300) | (Diff1(l3,l3+1)*0x0c000c00); /* non-adaptive: the Diff1() result is used directly as the multiplier for each mask */
++#endif
++}
++#undef __MMX__
++#else
++#ifdef __MMX__
++/* always on for gcc for now */
++#define MMX_ONLY
++/* this is safe for -march=..., but not if someone specifies -mmmx manually */
++
++# ifdef MMX_ONLY
++# define Diff_mmx Diff
++# else
++ int has_mmx = 0;
++# define Diff(a,b) (has_mmx?Diff_mmx((a),(b)):Diff_any((a),(b)))
++# endif
++#else
++# define Diff_any Diff
++#endif
++
++// A better colour distance function, adapted from http://www.compuphase.com/cmetric.htm
++#if !defined(MMX_ONLY) || defined(DEBUG)
++
++inline static int Diff1(const unsigned char *e1, const unsigned char* e2) /* Live (non-table) colour test: returns 1 if the weighted distance between the pixels at e1 and e2 exceeds Hq2x_colourTrigger, else 0. Reads bytes 0..2 of each pixel. */
++{
++ long r,g,b;
++ long rmean;
++
++ rmean = e1[0]+e2[0]; /* NOTE(review): built from byte 0, the same channel used for b below, whereas Diff1_calc() sums the red components - confirm the intended channel/pixel layout */
++ b = e1[0]-e2[0];
++ g = e1[1]-e2[1];
++ r = e1[2]-e2[2];
++
++ return ((128+rmean)*r*r + (192-rmean)*b*b)/256 + g*g > Hq2x_colourTrigger; /* same weighting formula as Diff1_calc(), compared directly against the static threshold */
++}
++
++inline static int Diff_any(const unsigned long *l2, const unsigned long *l3) /* Plain C pattern test on two pixel rows: four pairwise Diff1() checks, each contributing a fixed bit mask. Reads l2[0..1] and l3[0..1]. */
++{
++ return (Diff1((unsigned char *)l2,(unsigned char *)(l3+1))*0x00aa0055) | (Diff1((unsigned char *)(l2+1),(unsigned char *)l3)*0x005500aa) | (Diff1((unsigned char *)(l2+1),(unsigned char *)(l3+1))*0x03000300) | (Diff1((unsigned char *)l3,(unsigned char *)(l3+1))*0x0c000c00); /* pairs: (l2[0],l3[1]), (l2[1],l3[0]), (l2[1],l3[1]), (l3[0],l3[1]) */
++}
++#endif
++
++#ifdef __MMX__
++typedef int mmx_1_64 __attribute__((mode(DI)));
++typedef int mmx_2_32 __attribute__((mode(V2SI)));
++typedef int mmx_4_16 __attribute__((mode(V4HI)));
++typedef int mmx_8_8 __attribute__((mode(V8QI)));
++
++static mmx_4_16 mmx_trigger;
++
++/* MMX variant of the pattern test: loads 64 bits from each of e1 and e2, evaluates four pairwise colour distances in parallel, compares them against mmx_trigger and packs the results into the pattern bits. Note: this needs BGRA pixel layout, with the A component replaced by (-R)+32 */
++inline static int Diff_mmx(const unsigned long *e1, const unsigned long *e2)
++{
++ mmx_4_16 mm0, mm1, mm2, mm3, mm4;
++#ifdef DEBUG
++ mmx_4_16 t0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16,t17,t18, m0, m1;
++ #define d(x) x =
++#else
++#define d(x)
++#endif
++ const mmx_8_8 zero = (mmx_8_8)0x0ULL;
++ const mmx_4_16 rmean_off = (mmx_4_16)(0x0a000a000a000a00ULL);
++ const mmx_4_16 factors = (mmx_4_16)0xfffdfff4aa5655abULL;
++
++ /* -1 * aa56 = 0101 0101 1010 1010 */
++ /* -1 * 55ab = 1010 1010 0101 0101 */
++
++ /*
++ Read from memory:
++ */
++ mm1 = *(mmx_4_16 *)e2;
++ mm0 = *(mmx_4_16 *)e1;
++
++ /* (high ................................ low)
++ -p2r+32, p2b, p2g, p2r, -p1r+32, p1b, p1g, p1r = mm0
++ -p5r+32, p5b, p5g, p5r, -p4r+32, p4b, p4g, p4r = mm1
++
++ Shuffle dwords so we get 4 registers with pixel
++ arrangement ready for difference calculation:
++
++ (2, 5, 4, 1) - (5, 4, 2, 5)
++
++ We choose (rrrr, gggg) + (bbbb, rrrr) layout. This
++ is quite expensive, given that difference calculation
++ in (rgbr, rgbr) form would need just two unpacks, but
++ the unpacking has to be done sooner or later, and
++ this pixel layout makes later calculations cheaper.
++ TODO: sse/mmxext version of this unpacking should be
++ much cheaper.
++
++ -p4r+32, -p1r+32, p4b, p1b, p4g, p1g, p4r, p1r = mm0
++ -p5r+32, p5b, p5g, p5r, -p5r+32, p5b, p5g, p5r = mm3 (temp)
++ -p2r+32, -p5r+32, p2b, p5b, p2g, p5g, p2r, p5r = mm2
++ -p5r+32, -p4r+32, p5b, p4b, p5g, p4g, p5r, p4r = mm1
++*/
++
++ mm3 = (mmx_4_16)__builtin_ia32_punpckhdq((mmx_2_32)mm1,(mmx_2_32)mm1);
++ mm2 = (mmx_4_16)__builtin_ia32_punpckhbw((mmx_8_8)mm1,(mmx_8_8)mm0);
++ mm0 = (mmx_4_16)__builtin_ia32_punpcklbw((mmx_8_8)mm0,(mmx_8_8)mm1);
++ mm1 = (mmx_4_16)__builtin_ia32_punpcklbw((mmx_8_8)mm1,(mmx_8_8)mm3);
++
++/*
++ ... continued ...
++
++ -p2r+32, -p5r+32, -p4r+32, -p1r+32, p2b, p5b, p4b, p1b = mm0
++ p2g, p5g, p4g, p1g, p2r, p5r, p4r, p1r = mm3
++
++ -p5r+32, -p4r+32, -p2r+32, -p5r+32, p5b, p4b, p2b, p5b = mm4
++ p5g, p4g, p2g, p5g, p5r, p4r, p2r, p5r = mm2
++
++*/
++
++ mm3 = (mmx_4_16)__builtin_ia32_punpcklwd(mm0,mm2);
++ mm0 = (mmx_4_16)__builtin_ia32_punpckhwd(mm0,mm2);
++ mm4 = (mmx_4_16)__builtin_ia32_punpckhwd(mm2,mm1);
++ mm2 = (mmx_4_16)__builtin_ia32_punpcklwd(mm2,mm1);
++
++ /*
++ Put mm2 with negated red component into mm1. Negation is done
++ in the lookup table.
++
++ -p2r+32, -p5r+32, -p4r+32, -p1r+32, p2b, p5b, p4b, p1b = mm0
++ p2g, p5g, p4g, p1g, p2r, p5r, p4r, p1r = mm3
++
++ -p5r+32, -p4r+32, -p2r+32, -p5r+32, p5b, p4b, p2b, p5b = mm4
++ p5g, p4g, p2g, p5g, -p5r+32, -p4r+32, -p2r+32, -p5r+32 = mm2
++
++ */
++
++ mm2 = (mmx_4_16)__builtin_ia32_punpckhdq((mmx_2_32)mm4,(mmx_2_32)mm2);
++
++ /*
++
++ Calculate the differences (and rmean)
++ mm0-mm4, mm3-mm2 (signed saturation)
++
++ d1r/8, d3r/8, d8r/8, d0r/8, d1b/8, d3b/8, d8b/8, d0b/8 = mm0
++ d1g/8, d3g/8, d8g/8, d0g/8, d1rmean/4-32, d3rmean/4-32, d8rmean/4-32, d0rmean/4-32 = mm3
++
++ */
++
++ mm0 = (mmx_4_16)__builtin_ia32_psubsb((mmx_8_8)mm0,(mmx_8_8)mm4);
++ mm3 = (mmx_4_16)__builtin_ia32_psubsb((mmx_8_8)mm3,(mmx_8_8)mm2);
++
++#ifdef DEBUG
++{
++ int p1 = e1[0];
++ int p5 = e2[1];
++ char *cmm0 = (void*)&mm0, *cmm3 = (void*)&mm3;
++ if (((int)cmm3[0]) != ((int)(p1&Rmask) + (int)(p5&Rmask) - 32)) abort();
++ if (((int)cmm0[4]) != -((int)(p1&Rmask) - (int)(p5&Rmask))) abort();
++}
++#endif
++
++ /* Intermediate stats:
++
++ (rough) code equivalent:
++ rmean = (((int)(e1&Rmask) + (int)(e2&Rmask)) >> 16) - 32;
++ r = ((int)(e2&Rmask) - (int)(e1&Rmask)) >> 16;
++ g = ((int)(e1&Gmask) - (int)(e2&Gmask)) >> 8;
++ b = ((int)(e1&Bmask) - (int)(e2&Bmask));
++
++ Gain:
++ 1 distance w/o mmx = 16 ops
++ 4 distances w/ mmx = 11 ops
++ (possible parallelism left to the compiler)
++
++ Todo:
++ ((160+rmean)*r/8*r/8) + 256*g/8*g/8 + ((160-rmean)*b/8*b/8)
++
++ (slightly incorrect: the result is the true difference plus (b/8)^2, but
++ this eliminates a constant, making the algorithm fit into the available
++ 8 registers)
++
++ d1r/8, d3r/8, d8r/8, d0r/8, d1b/8, d3b/8, d8b/8, d0b/8 = mm0
++ d1g/8, d3g/8, d8g/8, d0g/8, d1rmean/4-32, d3rmean/4-32, d8rmean/4-32, d0rmean/4-32 = mm3
++ */
++
++ /*
++ prepare differences for final calculation:
++
++ 00 d1r/2 00 d3r/2 00 d8r/2 00 d0r/2 = mm0
++ 00 d1b/2 00 d3b/2 00 d8b/2 00 d0b/2 = mm1
++ 00 d1g/8 00 d3g/8 00 d8g/8 00 d0g/8 = mm2
++ 00 (d1rmean/4-32)*16 00 (d3rmean/4-32)*16 00 (d8rmean/4-32)*16 00 (d0rmean/4-32)*16 = mm3
++
++ */
++
++#ifdef DEBUG
++ m0 = mm0;
++ m1 = mm3;
++#endif
++ // TODO: compiler error at __builtin_ia32_psllb(mm0,2);
++ d(t0) mm0 = __builtin_ia32_pmullw(mm0,(mmx_4_16)(0x0004000400040004ULL));
++ d(t1) mm1 = (mmx_4_16)__builtin_ia32_punpcklbw(zero, (mmx_8_8)mm0);
++ d(t2) mm0 = (mmx_4_16)__builtin_ia32_punpckhbw(zero, (mmx_8_8)mm0);
++ d(t3) mm2 = (mmx_4_16)__builtin_ia32_punpckhbw(zero, (mmx_8_8)mm3);
++ d(t4) mm3 = (mmx_4_16)__builtin_ia32_punpcklbw(zero, (mmx_8_8)mm3);
++ // TODO: compiler error at __builtin_ia32_psraw(mm3,4);
++ d(t5) mm3 = __builtin_ia32_pmulhw(mm3,(mmx_4_16)(0x1000100010001000ULL));
++
++ /*
++ intermediate results: squares and rmean factors
++
++ 00 (d1r/2)^2 00 (d3r/2)^2 00 (d8r/2)^2 00 (d0r/2)^2 = mm0
++ 00 (d1b/2)^2 00 (d3b/2)^2 00 (d8b/2)^2 00 (d0b/2)^2 = mm1
++ 00 256*(d1g/8)^2 00 256*(d3g/8)^2 00 256*(d8g/8)^2 00 256*(d0g/8)^2 = mm2
++ 00 128+d1rmean/4 00 128+d3rmean/4 00 128+d8rmean/4 00 128+d0rmean/4 = mm3
++ 00 192-d1rmean/4 00 192-d3rmean/4 00 192-d8rmean/4 00 192-d0rmean/4 = mm4
++
++ */
++
++ d(t9) mm0 = __builtin_ia32_pmulhw(mm0,mm0);
++ d(t10) mm1 = __builtin_ia32_pmulhw(mm1,mm1);
++ d(t11) mm2 = __builtin_ia32_pmulhw(mm2,mm2);
++ d(t12) mm4 = __builtin_ia32_psubsw(rmean_off,mm3);
++ d(t13) mm3 = __builtin_ia32_paddsw(mm3,rmean_off);
++
++ /*
++ intermediate results: finish red and blue components
++
++ 00 (128+d1rmean/4)*(d1r/8)^2 00 (128+d3rmean/4)*(d3r/8)^2 00 (128+d8rmean/4)*(d8r/8)^2 00 (128+d0rmean/4)*(d0r/8)^2 = mm0
++ 00 (192-d1rmean/4)*(d1b/8)^2 00 (192-d3rmean/4)*(d3b/8)^2 00 (192-d8rmean/4)*(d8b/8)^2 00 (192-d0rmean/4)*(d0b/8)^2 = mm1
++ 00 (d1g/8)^2 00 (d3g/8)^2 00 (d8g/8)^2 00 (d0g/8)^2 = mm2
++
++ */
++
++ d(t15) mm1 = __builtin_ia32_pmulhw(mm1, mm4);
++ d(t16) mm0 = __builtin_ia32_pmulhw(mm0, mm3);
++
++ /*
++ calculate final visual difference
++
++ (128+rmean/4)*(r/8)^2+(192-rmean/4)*(b/8)^2+256*(g/8)^2 = mm0 (order: 1 3 8 0)
++ */
++
++ d(t17) mm0 = __builtin_ia32_paddw(mm0,mm1);
++ d(t18) mm0 = __builtin_ia32_paddw(mm0,mm2);
++
++#ifdef DEBUG
++{
++ int p1 = e1[0];
++ int p5 = e2[1];
++ short *smm0 = (void*)&mm0;
++ long r,g,b;
++ long rmean, diff;
++ rmean = (((p1+p5)&Rmask)-32);
++ r = (p1&Rmask)-(p5&Rmask);
++ g = ((p1&Gmask) - (p5&Gmask)) >> 8;
++ b = ((p1&Bmask) - (p5&Bmask)) >> 16;
++
++ diff = ((160+rmean)*r*r + 256*g*g + (160-rmean)*b*b)/256;
++ if (diff > smm0[0]+1 || diff < smm0[0]-1) abort();
++}
++#endif
++
++ /*
++ Code equivalent:
++ ((((512+rmean)>>8)*r*r) + 4*g*g + (((768-rmean)>>8)*b*b))
++
++ test against threshold
++
++ (diff1?0xffff:0x0000) (diff3?0xffff:0x0000) (diff8?0xffff:0x0000) (diff0?0xffff:0x0000) = mm0
++ */
++
++ mm0 = __builtin_ia32_pcmpgtw(mm0,mmx_trigger);
++
++ /*
++ create final bit patterns
++
++ 0000 0000 (diff1*0x03000300)|(diff3*0x0c000c00)|(diff8*0x005500aa)|(diff0*0x00aa0055)
++
++ */
++
++ mm0 = (mmx_4_16)__builtin_ia32_pmaddwd(mm0,factors);
++ mm0 = (mmx_4_16)__builtin_ia32_punpcklbw((mmx_8_8)mm0,(mmx_8_8)__builtin_ia32_punpckhbw((mmx_8_8)mm0,(mmx_8_8)mm0));
++
++ return (unsigned long)(unsigned long long)mm0;
++ /*
++ Total: 11+16+3 = 30 ops for 4 distances vs. 16+13+7 = 36 ops for 1 distance
++ */
++}
++#endif
++#endif
++
++static int LUTPAL8to32[256] __attribute__((aligned(32)));
++//#define factors(a,b,c,d) (((a)-1) | ((b)<<2) | ((c)<<4) | ((d)<<6))
++#define P0 {8,0,0,0}
++#define P10 {6,2,0,0}
++#define P11 {6,0,0,2}
++#define P12 {6,0,2,0}
++#define P20 {4,0,2,2}
++#define P21 {4,2,2,0}
++#define P22 {4,2,0,2}
++#define P60 {5,0,2,1}
++#define P61 {5,0,1,2}
++#define P70 {6,0,1,1}
++#define P90 {2,0,3,3}
++#define P100 {7,0,0,1}
++#define X {0,0,0,0}
++#define UNUSED X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, \
++ X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X, X,X,X,X,X,X,X,X
++
++/* sparse table: only 2k of the 4096 entries are used. Each entry is one P* factor set (four factors that sum to 8); UNUSED pads every 128-entry gap with zeros. The offsets in the row comments are byte offsets (16 entries x 4 bytes per row). */
++static unsigned char factors[4096][4] __attribute__((aligned(32))) = {
++/* 0000 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 0040 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 0080 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 00c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 0100 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 0140 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 0180 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 01c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 0200 */ UNUSED,
++/* 0400 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0440 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0480 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 04c0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0500 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0540 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0580 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 05c0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0600 */ UNUSED,
++/* 0800 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 0840 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 0880 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 08c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 0900 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 0940 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 0980 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 09c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 0a00 */ UNUSED,
++/* 0c00 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0c40 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0c80 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0cc0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0d00 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0d40 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0d80 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0dc0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 0e00 */ UNUSED,
++/* 1000 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 1040 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 1080 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 10c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 1100 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 1140 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 1180 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 11c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 1200 */ UNUSED,
++/* 1400 */ P20, P20, P10, P0, P20, P20, P10, P0, P90, P90, P10, P0, P90, P90, P10, P0,
++/* 1440 */ P20, P20, P10, P0, P20, P20, P10, P0, P90, P90, P10, P0, P90, P90, P10, P0,
++/* 1480 */ P90, P90, P10, P0, P90, P90, P10, P0, P70, P100,P10, P0, P70, P100,P10, P0,
++/* 14c0 */ P90, P90, P10, P0, P90, P90, P10, P0, P70, P100,P10, P0, P70, P100,P10, P0,
++/* 1500 */ P20, P20, P10, P0, P20, P20, P10, P0, P90, P90, P10, P0, P90, P90, P10, P0,
++/* 1540 */ P20, P20, P10, P0, P20, P20, P10, P0, P90, P90, P10, P0, P90, P90, P10, P0,
++/* 1580 */ P90, P90, P10, P0, P90, P90, P10, P0, P70, P100,P10, P0, P70, P100,P10, P0,
++/* 15c0 */ P90, P90, P10, P0, P90, P90, P10, P0, P70, P100,P10, P0, P70, P100,P10, P0,
++/* 1600 */ UNUSED,
++/* 1800 */ P21, P61, P21, P61, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
++/* 1840 */ P21, P61, P21, P61, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
++/* 1880 */ P21, P61, P21, P61, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
++/* 18c0 */ P21, P61, P21, P61, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
++/* 1900 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 1940 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 1980 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 19c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 1a00 */ UNUSED,
++/* 1c00 */ P20, P20, P0, P0, P20, P20, P0, P0, P10, P20, P10, P0, P10, P20, P10, P0,
++/* 1c40 */ P20, P20, P0, P0, P20, P20, P0, P0, P10, P20, P10, P0, P10, P20, P10, P0,
++/* 1c80 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
++/* 1cc0 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
++/* 1d00 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0,
++/* 1d40 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0,
++/* 1d80 */ P70, P90, P10, P0, P70, P90, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
++/* 1dc0 */ P70, P90, P10, P0, P70, P90, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
++/* 1e00 */ UNUSED,
++/* 2000 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 2040 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 2080 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 20c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 2100 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 2140 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 2180 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 21c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 2200 */ UNUSED,
++/* 2400 */ P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, /* above */
++/* 2440 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 2480 */ P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60, P22, P60,
++/* 24c0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 2500 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 2540 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 2580 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 25c0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 2600 */ UNUSED,
++/* 2800 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 2840 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 2880 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 28c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 2900 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 2940 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 2980 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 29c0 */ P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20, P20,
++/* 2a00 */ UNUSED,
++/* 2c00 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, /* rot */
++/* 2c40 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 2c80 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P60, P22, P60, P22, P60, P22, P60,
++/* 2cc0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 2d00 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 2d40 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 2d80 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 2dc0 */ P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11, P22, P11,
++/* 2e00 */ UNUSED,
++/* 3000 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 3040 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 3080 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 30c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 3100 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 3140 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 3180 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 31c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 3200 */ UNUSED,
++/* 3400 */ P20, P20, P0, P0, P20, P20, P0, P0, P70, P20, P10, P0, P70, P20, P10, P0,
++/* 3440 */ P20, P20, P0, P0, P20, P20, P0, P0, P70, P20, P10, P0, P70, P20, P10, P0,
++/* 3480 */ P10, P20, P10, P0, P10, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
++/* 34c0 */ P10, P20, P10, P0, P10, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
++/* 3500 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P90, P10, P0, P70, P90, P10, P0,
++/* 3540 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P90, P10, P0, P70, P90, P10, P0,
++/* 3580 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
++/* 35c0 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
++/* 3600 */ UNUSED,
++/* 3800 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 3840 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 3880 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
++/* 38c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P61, P21, P61, P21, P12, P21, P12,
++/* 3900 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 3940 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 3980 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 39c0 */ P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12, P21, P12,
++/* 3a00 */ UNUSED,
++/* 3c00 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0,
++/* 3c40 */ P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0, P70, P20, P10, P0,
++/* 3c80 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
++/* 3cc0 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
++/* 3d00 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P20, P10, P0, P10, P20, P10, P0,
++/* 3d40 */ P70, P20, P10, P0, P70, P20, P10, P0, P10, P20, P10, P0, P10, P20, P10, P0,
++/* 3d80 */ P10, P20, P10, P0, P10, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
++/* 3dc0 */ P10, P20, P10, P0, P10, P20, P10, P0, P10, P100,P10, P0, P10, P100,P10, P0,
++/* 3e00 */ UNUSED
++};
++
++/* Memory usage at 320 pixels width:
++
++ ~4k line buffer
++ ~2.5k pattern buffer
++ 8k factor table (can be packed into as little as 2k, but unpacking takes more time than we lose due
++ to cache thrashing, at least on a pentium2)
++ 1k palette table
++------
++~15.5k data
++
++If the diff table is used, another 8k (packed) / 64k (unpacked) are used.
++*/
++
++/* Optimizations:
++
++Pixel/Pattern layout:
++
++1 2 3
++ a c
++4 5 6
++ b d
++7 8 9
++
++Factor storage:
++Pixel: 5 1 2 4
++
++ 0: 8 0 0 0
++ 10: 6 2 0 0
++ 11: 6 0 0 2
++ 12: 6 0 2 0
++ 20: 4 0 2 2
++ 21: 4 2 2 0
++ 22: 4 2 0 2
++ 60: 5 0 2 1
++ 61: 5 0 1 2
++ 70: 6 0 1 1
++ 90: 2 0 3 3
++100: 7 0 1 0
++
++bits: 3 1 2 2 = 8 bit
++
++Factor set 100 is wrong: 7 0 0.5 0.5 would be 100% like the original, but
++since pixel 2 and 4 are visually close for all patterns where this set is
++used, this important simplification should not be visible.
++
++Pattern usage:
++
++a b c d
++0 8 1 3 10 5 6 (a) 9 2 (b) 4 7 (11) (c) (d) -> 1 2 4 5
++
++0 8 10 5 9 2 7 x 1 6 3 4
++
++cr+a2 dr+b2 ar br
++2 9 1 4 11 7 6 (d) 8 0 (x) 3 5 (10) (y) (a) -> 3 2 6 5
++
++br+a3 ar dr+c3 cr
++5 10 6 3 8 0 1 (m) 11 7 (c) 4 2 (9) (b) (n) -> 7 8 4 5
++
++d+b2+c3 c+a2 b+a3 a
++7 11 6 4 9 2 1 (n) 10 5 (y) 3 0 (8) (x) (m) -> 9 8 6 5
++
++
++Pattern storage:
++
++0123 = \/|_
++
++0101 010x 2323
++1010 101x 2323
++
++*/
++
++union pattern { /* one column's edge-pattern word: stored whole through .value, tested per half through .p[] */
++ unsigned short p[2]; /* the two 16-bit halves that the line renderers mask bits out of */
++ unsigned long value; /* whole pattern as assigned from Diff(); NOTE(review): overlays p[] exactly only where long is 32 bits wide */
++};
++
++static unsigned long lines0[Hq2x_MAXWIDTH+2] __attribute__((aligned(32))); /* lines0..lines2: storage for three palette-converted source lines, one border entry on each side */
++static unsigned long lines1[Hq2x_MAXWIDTH+2] __attribute__((aligned(32)));
++static unsigned long lines2[Hq2x_MAXWIDTH+2] __attribute__((aligned(32)));
++static unsigned long *l1, *l2, *l3, *tmp; /* l2 is the line being scaled, l1 the line above it, l3 the line below; rotated per input line with tmp as scratch */
++static union pattern p0[Hq2x_MAXWIDTH+2] __attribute__((aligned(32))); /* p0/p1: storage for two rows of edge patterns */
++static union pattern p1[Hq2x_MAXWIDTH+2] __attribute__((aligned(32)));
++static union pattern *top, *bot, *ptmp; /* bot receives the newest Diff() row, top keeps the previous one; swapped per input line via ptmp */
++static unsigned char prev[Hq2x_MAXWIDTH+2]; /* raw copy of an input line, read by the DIFF_TABLE variant of diffcall */
++
++#ifdef DIFF_TABLE /* diffcall fills bot[i] for column i; it is used as the unbraced body of a for loop */
++#define diffcall bot[i].value = Diff(prev+i,pIn+i); memcpy(prev,pIn,Scaler_SrcWidth+1); /* two statements: only the assignment loops, the memcpy runs once after the loop and saves the incoming line as prev */
++#else
++#define diffcall bot[i].value = Diff(l2+i,l3+i); /* compares column i of the two newest converted lines */
++#endif
++
++#ifndef __MMX__
++#define __builtin_ia32_emms() /* expands to nothing when not compiling for MMX */
++#endif
++
++#define CONSTCHECK if (Scaler_SrcWidth == 320) RENDER_DrawLine = /* prefix for CHECK_CONST: the user appends a function name and ';' */
++
++#define store(out,index,x,y) do{((unsigned long*)(out))[(index)*2] = 0xff000000|(x); ((unsigned long*)(out))[(index)*2+1] = 0xff000000|(y);}while(0) /* 32-bit output: writes x then y as adjacent words with the top byte forced to 0xff; out and index are parenthesised so any expression can be passed */
++#define type long /* output pixel type; the template uses sizeof(type) to bias the destination pointer */
++
++#define FUNC Hq2x_long_320_line /* fixed-width variant: Scaler_SrcWidth is the literal 320 while the template is included */
++#define Scaler_SrcWidth 320
++#define CHECK_CONST /* empty: the fixed-width variant never re-dispatches */
++#include "render_hq2x_template.h"
++#undef CHECK_CONST
++#undef Scaler_SrcWidth
++#undef FUNC
++
++#define FUNC Hq2x_long_Scaler_SrcWidth_line /* generic variant: Scaler_SrcWidth is no longer a macro here, so the template sees the real identifier */
++#define CHECK_CONST CONSTCHECK Hq2x_long_320_line; /* on the first line, repoint RENDER_DrawLine at the fixed-width variant when the width is 320 */
++#include "render_hq2x_template.h"
++#undef store
++#undef type
++#undef CHECK_CONST
++#undef FUNC
++
++/* 16 bit support: store() packs two blended pixels into one word, 5/6/5 bits taken from the red/green/blue bytes of each value */
++#ifdef WORDS_BIGENDIAN
++#define store(out,index,y,x) ((unsigned long *)(out))[(index)] = ((((x)>>3)&0x1f)|(((y)<<13)&0x1f0000)|(((x)>>5)&0x7e0)|(((y)<<11)&0x7e00000)|(((x)>>8)&0xf800)|(((y)<<8)&0xf8000000)) /* same packing with the two pixel arguments swapped */
++#else
++#define store(out,index,x,y) ((unsigned long *)(out))[(index)] = ((((x)>>3)&0x1f)|(((y)<<13)&0x1f0000)|(((x)>>5)&0x7e0)|(((y)<<11)&0x7e00000)|(((x)>>8)&0xf800)|(((y)<<8)&0xf8000000)) /* x fills bits 0-15, y fills bits 16-31; each mask is now grouped with its own shift */
++#endif
++#define type short /* output pixel type; the template uses sizeof(type) to bias the destination pointer */
++
++#define FUNC Hq2x_short_320_line /* fixed-width variant: Scaler_SrcWidth is the literal 320 while the template is included */
++#define Scaler_SrcWidth 320
++#define CHECK_CONST /* empty: the fixed-width variant never re-dispatches */
++#include "render_hq2x_template.h"
++#undef CHECK_CONST
++#undef Scaler_SrcWidth
++#undef FUNC
++
++#define FUNC Hq2x_short_Scaler_SrcWidth_line /* generic variant: Scaler_SrcWidth is no longer a macro here, so the template sees the real identifier */
++#define CHECK_CONST CONSTCHECK Hq2x_short_320_line; /* on the first line, repoint RENDER_DrawLine at the fixed-width variant when the width is 320 */
++#include "render_hq2x_template.h"
++#undef store
++#undef type
++#undef CHECK_CONST
++#undef FUNC
++
++ScalerBlock Hq2x_8={ /* scaler descriptor that render.cpp selects for OP_Hq2x; NOTE(review): field meanings are declared in render_scalers.h, not visible here */
++ CAN_16|CAN_32|LOVE_32|NEED_RGB,
++ 2,2,1,
++ 0,Hq2x_short_Scaler_SrcWidth_line,Hq2x_short_Scaler_SrcWidth_line,Hq2x_long_Scaler_SrcWidth_line /* first slot empty, the 16-bit packer in the next two, the 32-bit writer last */
++};
++
++void Hq2x_InitLUTs(const void *pal, int palette_end, int palette_start) /* Refresh the hq2x lookup data: convert palette entries palette_start..palette_end (inclusive) into LUTPAL8to32, then rebuild the diff table (DIFF_TABLE builds) and the MMX trigger (__MMX__ builds). With palette_start > palette_end the palette pass is skipped and pal is never read. */
++{
++ int i, j;
++ const struct GFX_PalEntry *palette = (const struct GFX_PalEntry *)pal; /* keep the caller's const: entries are only read */
++
++ // All components are reduced to 5 bit (VGA palette has 6 bit)
++ // for simpler multiplication and storage (divided by 8)
++ for (i=palette_start; i<=palette_end; i++) {
++ // 5 significant bits with 3 bit multiplier fit into 8 bit, thus
++ // plain int multiplication can be used without tricks
++ // R is duplicated into A, negated and increased by 32 for some
++ // nice mmx distance calculation tricks
++ LUTPAL8to32[i] = ((palette[i].r&0xf8) << 13) | ((palette[i].g&0xf8) << 5) | ((palette[i].b&0xf8) >> 3) | ((32*8-(palette[i].r&0xf8)) << 21); /* layout: R in bits 16-20, G in 8-12, B in 0-4, (32 - R) from bit 24 up */
++ }
++
++#ifdef DIFF_TABLE
++#if DIFF_TABLE != 1
++ memset(difftable,0,sizeof(difftable)); /* the packed table is built with |=, so it has to start cleared */
++#endif
++ for (i = 0; i < 256; i++) {
++ for (j = 0; j < 256; j++) {
++ difftable[(i) * (256/bits) + ((j) / bits)]
++#if DIFF_TABLE == 1
++ =
++#else
++ |=
++#endif
++ Diff1_calc((LUTPAL8to32[i]>>16)&0x1f,(LUTPAL8to32[i]>>8)&0x1f,(LUTPAL8to32[i])&0x1f, (LUTPAL8to32[j]>>16)&0x1f,(LUTPAL8to32[j]>>8)&0x1f,(LUTPAL8to32[j])&0x1f) << (j%bits); /* compares the 5-bit R, G, B fields of palette entries i and j */
++ }
++ }
++#endif
++
++#ifdef __MMX__
++ *((short *)(&mmx_trigger)) = Hq2x_colourTrigger; /* copy the threshold into each of the four 16-bit lanes of mmx_trigger */
++ *(((short *)(&mmx_trigger))+1) = Hq2x_colourTrigger;
++ *(((short *)(&mmx_trigger))+2) = Hq2x_colourTrigger;
++ *(((short *)(&mmx_trigger))+3) = Hq2x_colourTrigger;
++#endif
++}
++
++void Hq2x_IncreaseThreshold(void) /* Raise the static colour threshold by one (no further once it reaches 255), refresh the derived lookup data and log the new value. */
++{
++ if (Hq2x_colourTrigger < 255) Hq2x_colourTrigger++;
++ Hq2x_InitLUTs(0,0,1); /* empty palette range (start 1 > end 0): only the diff table / MMX trigger part runs */
++ LOG_MSG("Hq2x threshold at %i",(int)Hq2x_colourTrigger); /* the trigger is a long; %i takes an int */
++}
++
++void Hq2x_DecreaseThreshold(void) /* Lower the static colour threshold by one (no further once it reaches 0), refresh the derived lookup data and log the new value. */
++{
++ if (Hq2x_colourTrigger > 0) Hq2x_colourTrigger--;
++ Hq2x_InitLUTs(0,0,1); /* empty palette range (start 1 > end 0): only the diff table / MMX trigger part runs */
++ LOG_MSG("Hq2x threshold at %i",(int)Hq2x_colourTrigger); /* the trigger is a long; %i takes an int */
++}
++
++void Hq2x_IncreaseThresholdAdaptive(void) /* Raise the adaptive threshold by one (no further once it reaches 100) and log the new value; no tables are rebuilt here. */
++{
++ if (Hq2x_colourTrigger_adaptive < 100) Hq2x_colourTrigger_adaptive++;
++ LOG_MSG("Hq2x adaptive threshold at %i",(int)Hq2x_colourTrigger_adaptive); /* the trigger is a long; %i takes an int */
++}
++
++void Hq2x_DecreaseThresholdAdaptive(void) /* Lower the adaptive threshold by one (no further once it reaches 0) and log the new value; no tables are rebuilt here. */
++{
++ if (Hq2x_colourTrigger_adaptive > 0) Hq2x_colourTrigger_adaptive--;
++ LOG_MSG("Hq2x adaptive threshold at %i",(int)Hq2x_colourTrigger_adaptive); /* the trigger is a long; %i takes an int */
++}
++
+diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render_hq2x.h dosbox-0.61+hq2x/src/gui/render_hq2x.h
+--- dosbox-0.61/src/gui/render_hq2x.h 1970-01-01 01:00:00.000000000 +0100
++++ dosbox-0.61+hq2x/src/gui/render_hq2x.h 2004-08-02 20:34:46.000000000 +0200
+@@ -0,0 +1,31 @@
++//derived from the hq2x filter demo program
++//----------------------------------------------------------
++//Copyright (C) 2003 MaxSt ( maxst@hiend3d.com )
++// Speed optimization and mmx code Copyright (c) 2004 Jörg Walter (jwalt@garni.ch)
++
++//This program is free software; you can redistribute it and/or
++//modify it under the terms of the GNU Lesser General Public
++//License as published by the Free Software Foundation; either
++//version 2.1 of the License, or (at your option) any later version.
++//
++//This program is distributed in the hope that it will be useful,
++//but WITHOUT ANY WARRANTY; without even the implied warranty of
++//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++//Lesser General Public License for more details.
++//
++//You should have received a copy of the GNU Lesser General Public
++//License along with this program; if not, write to the Free Software
++//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++
++#ifndef __HQ2X_H
++#define __HQ2X_H
++
++#define Hq2x_MAXWIDTH (640-2) /* the scaler's line buffers hold Hq2x_MAXWIDTH+2 entries; parenthesised so the value survives use inside larger expressions */
++extern long Hq2x_colourTrigger; /* static threshold; the Increase/Decrease helpers step it within 0..255 */
++extern long Hq2x_colourTrigger_adaptive; /* adaptive threshold; the helpers step it within 0..100 */
++extern void Hq2x_InitLUTs(const void *palette, int palette_end, int palette_start); /* note the order: end index before start index, both inclusive */
++extern void Hq2x_IncreaseThreshold(void);
++extern void Hq2x_DecreaseThreshold(void);
++extern void Hq2x_IncreaseThresholdAdaptive(void);
++extern void Hq2x_DecreaseThresholdAdaptive(void);
++#endif
+diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render_hq2x_template.h dosbox-0.61+hq2x/src/gui/render_hq2x_template.h
+--- dosbox-0.61/src/gui/render_hq2x_template.h 1970-01-01 01:00:00.000000000 +0100
++++ dosbox-0.61+hq2x/src/gui/render_hq2x_template.h 2004-08-02 17:27:01.000000000 +0200
+@@ -0,0 +1,78 @@
++static void FUNC(const unsigned char *pIn) /* Line-handler template: takes one 8-bit source line per call and emits the scaled output for the PREVIOUS line, because each line needs its lower neighbour; the includer supplies FUNC, type, store and CHECK_CONST. */
++{
++ int i, j;
++ unsigned int factor, value1, value2, linesa = (*Scaler_Index++)+1, linesb = linesa/2; /* consumes one Scaler_Index entry; entry+1 output rows are split between the two halves */
++ linesa -= linesb; /* linesa: rows for the upper output half (gets the odd one), linesb: rows for the lower half */
++
++ pIn--; /* shift so pIn[1] is the first pixel, matching the 1-based line buffers */
++ if (__builtin_expect(Scaler_Line++==0,0)) { /* first line of a frame: prime the buffers, emit nothing */
++ int i;
++ CHECK_CONST /* generic variant: may repoint RENDER_DrawLine at the fixed-width function; empty in the fixed-width variant */
++
++ Scaler_DstWrite -= 2*sizeof(type); /* store() is indexed from i=1, so bias the destination back by one output pixel pair */
++ l1 = lines0;
++ l2 = lines1;
++ l3 = lines2;
++ for (i=0; i <= Scaler_SrcWidth+1; i++) l2[i] = 0x20000000; /* virtual line above the image; 0x20000000 is what the palette conversion yields for black */
++ l3[0] = 0x20000000; /* left border column */
++ l3[1] = LUTPAL8to32[pIn[1]];
++
++ for (i=2; i<=Scaler_SrcWidth+1; i++) /* NOTE(review): the last iteration reads the source byte just past the line's width - confirm the caller's buffer allows it */
++ l3[i] = LUTPAL8to32[pIn[i]];
++
++ top = p0;
++ bot = p1;
++ memcpy(prev,pIn,Scaler_SrcWidth+1); /* starts at the byte before the line because pIn was decremented */
++ for (i=1; i <= Scaler_SrcWidth; i++) diffcall /* only the Diff() assignment is the loop body */
++ return;
++ }
++
++ tmp = l1; l1 = l2; l2 = l3; l3 = tmp; /* rotate: the previous input line becomes the line being scaled */
++ ptmp = top; top = bot; bot = ptmp; /* likewise for the two pattern rows */
++ bot[0].value = 0x07ff07ff; /* fixed pattern for border column 0, which the Diff() loop (starting at 1) never writes */
++
++ l3[0] = 0x20000000; /* left border column */
++ l3[1] = LUTPAL8to32[pIn[1]];
++
++ for (i=2; i<=Scaler_SrcWidth+1; i++) /* same one-past-the-width read as in the first-line branch */
++ l3[i] = LUTPAL8to32[pIn[i]];
++
++ for (i=1; i<=Scaler_SrcWidth; i++) diffcall /* fill bot[] for the incoming line */
++
++ if (linesa > 0) { /* upper output row(s): blend the current line with the line above */
++ for (i=1; i<=Scaler_SrcWidth; i++) {
++ factor = (top[i-1].p[0]&0x503)|(bot[i-1].p[0]&0x20c)|(top[i].p[0]&0x830)|(bot[i].p[0]&0x040); /* gather pattern bits from four neighbouring entries into a factors[] index */
++ value1 = (l1[i-1]*factors[factor][1]+l1[i]*factors[factor][2]+l2[i-1]*factors[factor][3]+l2[i]*factors[factor][0]); /* upper-left output pixel: weighted sum of centre, above-left, above and left */
++
++ factor = (top[i-1].p[1]&0x930)|(bot[i-1].p[1]&0x240)|(top[i].p[1]&0x403)|(bot[i].p[1]&0x00c);
++ value2 = (l1[i+1]*factors[factor][1]+l1[i]*factors[factor][2]+l2[i+1]*factors[factor][3]+l2[i]*factors[factor][0]); /* upper-right output pixel: mirrored neighbours */
++ store(Scaler_DstWrite,i,value1,value2);
++ }
++ while (--linesa) { /* any further rows of this half are plain copies of the row just written */
++ memcpy(Scaler_DstWrite+Scaler_DstPitch,Scaler_DstWrite,Scaler_DstPitch);
++ Scaler_DstWrite += Scaler_DstPitch;
++ }
++ Scaler_DstWrite += Scaler_DstPitch;
++ }
++
++ if (linesb > 0) { /* lower output row(s): blend the current line with the line below */
++ for (i=1; i <= Scaler_SrcWidth; i++) {
++ factor = (top[i-1].p[1]&0x60c)|(bot[i-1].p[1]&0x103)|(top[i].p[1]&0x840)|(bot[i].p[1]&0x030);
++ value1 = (l3[i-1]*factors[factor][1]+l3[i]*factors[factor][2]+l2[i-1]*factors[factor][3]+l2[i]*factors[factor][0]); /* lower-left output pixel */
++
++ factor = (top[i-1].p[0]&0xa40)|(bot[i-1].p[0]&0x130)|(top[i].p[0]&0x40c)|(bot[i].p[0]&0x003);
++ value2 = (l3[i+1]*factors[factor][1]+l3[i]*factors[factor][2]+l2[i+1]*factors[factor][3]+l2[i]*factors[factor][0]); /* lower-right output pixel */
++ store(Scaler_DstWrite,i,value1,value2);
++ }
++ while (--linesb) { /* any further rows of this half are plain copies of the row just written */
++ memcpy(Scaler_DstWrite+Scaler_DstPitch,Scaler_DstWrite,Scaler_DstPitch);
++ Scaler_DstWrite += Scaler_DstPitch;
++ }
++ Scaler_DstWrite += Scaler_DstPitch;
++ }
++
++ if (__builtin_expect(Scaler_Line==Scaler_SrcHeight,0)) { /* last source line received: output is one line behind, so flush it */
++ FUNC(pIn+1); /* feed the same line again (pIn+1 undoes the decrement); it serves as its own lower neighbour, and Scaler_Line moves past the height so this happens once */
++ __builtin_ia32_emms();
++ }
++}
+diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/gui/render_scalers.h dosbox-0.61+hq2x/src/gui/render_scalers.h
+--- dosbox-0.61/src/gui/render_scalers.h 2004-06-10 09:18:19.000000000 +0200
++++ dosbox-0.61+hq2x/src/gui/render_scalers.h 2004-07-04 23:29:49.000000000 +0200
+@@ -30,6 +30,7 @@
+ OP_AdvInterp2x,
+ OP_Interp2x,
+ OP_TV2x,
++ OP_Hq2x,
+ };
+
+ struct ScalerBlock {
+@@ -46,6 +47,7 @@
+ extern ScalerBlock AdvInterp2x_8;
+ extern ScalerBlock Interp2x_8;
+ extern ScalerBlock TV2x_8;
++extern ScalerBlock Hq2x_8;
+
+
+ #endif
+diff -x aclocal.m4 -x CVS -x configure -x '*.in' -x '*~' -x '*.o' -x '*.a' -x Makefile -x config.h -x config.status -x config.log -x 'stamp-h*' -x '*.Po' -x autom4te.cache -x config.guess -x '.#*' -ruN dosbox-0.61/src/hardware/ymf262.c dosbox-0.61+hq2x/src/hardware/ymf262.c
+--- dosbox-0.61/src/hardware/ymf262.c 2004-03-28 15:04:45.000000000 +0200
++++ dosbox-0.61+hq2x/src/hardware/ymf262.c 2004-06-20 03:54:47.000000000 +0200
+@@ -844,23 +844,52 @@
+ INLINE signed int op_calc(UINT32 phase, unsigned int env, signed int pm, unsigned int wave_tab)
+ {
+ UINT32 p;
++ int pos = (((signed int)((phase & ~FREQ_MASK) + (pm<<16))) >> FREQ_SH ); /* table position from phase plus modulation (pm scaled by 1<<16), before masking */
++#ifdef SMALL_CACHE /* remap pos so that every wave_tab other than 6*SIN_LEN and 7*SIN_LEN is read from table 0 */
++ if ((wave_tab == 1*SIN_LEN) && (pos & (SIN_LEN>>1))) pos = 0;
++ if ((wave_tab == 3*SIN_LEN) && (pos & (SIN_LEN>>2))) pos = 0;
++ if (wave_tab == 2*SIN_LEN || wave_tab == 3*SIN_LEN) pos &= SIN_MASK>>1;
++ if (wave_tab == 4*SIN_LEN || wave_tab == 5*SIN_LEN) {
++ if (wave_tab == 5*SIN_LEN) pos &= SIN_MASK>>1;
++ pos *= 2;
++ if (pos & (SIN_LEN>>1)) pos = 0;
++ }
++ if (wave_tab != 6*SIN_LEN && wave_tab != 7*SIN_LEN) wave_tab = 0;
++#endif
++ p = (env<<4) + sin_tab[wave_tab + (pos & SIN_MASK)];
+
+- p = (env<<4) + sin_tab[wave_tab + ((((signed int)((phase & ~FREQ_MASK) + (pm<<16))) >> FREQ_SH ) & SIN_MASK) ];
+-
+- if (p >= TL_TAB_LEN)
+- return 0;
++#if 1
++ return tl_tab[p&(TL_TAB_LEN/13-1)] >> (p/(TL_TAB_LEN/13)); /* index only the first 1/13th of tl_tab and apply the rest of p as a right shift; NOTE(review): the mask needs TL_TAB_LEN/13 to be a power of two */
++#else
++ if (p >= TL_TAB_LEN) return 0; /* same bound as the pre-patch check: p == TL_TAB_LEN is already past the table */
+ return tl_tab[p];
++#endif
+ }
+
+ INLINE signed int op_calc1(UINT32 phase, unsigned int env, signed int pm, unsigned int wave_tab)
+ {
+ UINT32 p;
++ int pos = (((signed int)((phase & ~FREQ_MASK) + pm)) >> FREQ_SH ); /* table position from phase plus modulation (pm added unscaled here), before masking */
++#ifdef SMALL_CACHE /* remap pos so that every wave_tab other than 6*SIN_LEN and 7*SIN_LEN is read from table 0 */
++ if ((wave_tab == 1*SIN_LEN) && (pos & (SIN_LEN>>1))) pos = 0;
++ if ((wave_tab == 3*SIN_LEN) && (pos & (SIN_LEN>>2))) pos = 0;
++ if (wave_tab == 2*SIN_LEN || wave_tab == 3*SIN_LEN) pos &= SIN_MASK>>1;
++ if (wave_tab == 4*SIN_LEN || wave_tab == 5*SIN_LEN) {
++ if (wave_tab == 5*SIN_LEN) pos &= SIN_MASK>>1;
++ pos *= 2;
++ if (pos & (SIN_LEN>>1)) pos = 0;
++ }
++ if (wave_tab != 6*SIN_LEN && wave_tab != 7*SIN_LEN) wave_tab = 0;
++#endif
+
+- p = (env<<4) + sin_tab[wave_tab + ((((signed int)((phase & ~FREQ_MASK) + pm))>>FREQ_SH) & SIN_MASK)];
++ p = (env<<4) + sin_tab[wave_tab + (pos & SIN_MASK)];
+
+- if (p >= TL_TAB_LEN)
+- return 0;
++#if 1
++ return tl_tab[p&(TL_TAB_LEN/13-1)] >> (p/(TL_TAB_LEN/13)); /* index only the first 1/13th of tl_tab and apply the rest of p as a right shift; NOTE(review): the mask needs TL_TAB_LEN/13 to be a power of two */
++#else
++ if (p >= TL_TAB_LEN) return 0; /* same bound as the pre-patch check: p == TL_TAB_LEN is already past the table */
+ return tl_tab[p];
++#endif
+ }
+
+
+diff -ruN src./dosbox.cpp src/dosbox.cpp
+--- dupa/src./dosbox.cpp 2004-09-30 15:15:59.000000000 +0200
++++ dupa/src/dosbox.cpp 2004-09-30 15:18:48.301932384 +0200
+@@ -231,11 +231,17 @@
+ secprop->Add_int("frameskip",0);
+ secprop->Add_bool("aspect",false);
+ secprop->Add_string("scaler","normal2x");
++ secprop->Add_int("hq2x_threshold_adaptive",75);
++ secprop->Add_int("hq2x_threshold",0);
+ MSG_Add("RENDER_CONFIGFILE_HELP",
+ "frameskip -- How many frames dosbox skips before drawing one.\n"
+ "aspect -- Do aspect correction.\n"
+ "scaler -- Scaler used to enlarge/enhance low resolution modes.\n"
+- " Supported are none,normal2x,advmame2x,advmame3x,advinterp2x,interp2x,tv2x.\n"
++ " Supported are none,normal2x,advmame2x,advmame3x,advinterp2x,interp2x,tv2x,hq2x.\n"
++ "hq2x_threshold_adaptive -- The adaptive threshold used to detect edges in hq2x\n"
++ " Possible values are 0-100, can be modified with Ctrl+Alt+F5/F6\n"
++ "hq2x_threshold -- The static threshold used to detect edges in hq2x\n"
++ " Possible values are 0-255, can be modified with Ctrl+Alt+F3/F4\n"
+ );
+
+ secprop=control->AddSection_prop("cpu",&CPU_Init);