1 --- gdal-1.11.0/alg/gdalgrid_priv.h.orig 2014-04-16 22:04:48.000000000 +0200
2 +++ gdal-1.11.0/alg/gdalgrid_priv.h 2014-05-11 20:50:49.579220569 +0200
5 } GDALGridExtraParameters;
7 +#ifdef HAVE_SSE_AT_COMPILE_TIME
8 +int CPLHaveRuntimeSSE();
11 +GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(
12 + const void *poOptions,
14 + const double *unused_padfX,
15 + const double *unused_padfY,
16 + const double *unused_padfZ,
17 + double dfXPoint, double dfYPoint,
19 + void* hExtraParamsIn );
22 #ifdef HAVE_AVX_AT_COMPILE_TIME
23 int CPLHaveRuntimeAVX();
25 --- gdal-1.11.0/alg/gdalgridsse.cpp.orig 1970-01-01 01:00:00.000000000 +0100
26 +++ gdal-1.11.0/alg/gdalgridsse.cpp 2014-05-11 21:54:46.609140595 +0200
28 +#include "gdalgrid.h"
29 +#include "gdalgrid_priv.h"
31 +#ifdef HAVE_SSE_AT_COMPILE_TIME
32 +#include <xmmintrin.h>
34 +/************************************************************************/
35 +/* CPLHaveRuntimeSSE() */
36 +/************************************************************************/
38 +#define CPUID_SSE_EDX_BIT 25
40 +#if (defined(_M_X64) || defined(__x86_64))
42 +int CPLHaveRuntimeSSE()
47 +#elif defined(__GNUC__) && defined(__i386__)
49 +int CPLHaveRuntimeSSE()
51 + int cpuinfo[4] = {0,0,0,0};
52 + GCC_CPUID(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
53 + return (cpuinfo[3] & (1 << CPUID_SSE_EDX_BIT)) != 0;
56 +#elif defined(_MSC_VER) && defined(_M_IX86)
59 +static void inline __cpuid(int cpuinfo[4], int level)
69 + mov dword ptr [esi], eax
70 + mov dword ptr [esi+4],ebx
71 + mov dword ptr [esi+8],ecx
72 + mov dword ptr [esi+0Ch],edx
82 +int CPLHaveRuntimeSSE()
84 + int cpuinfo[4] = {0,0,0,0};
85 + __cpuid(cpuinfo, 1);
86 + return (cpuinfo[3] & (1 << CPUID_SSE_EDX_BIT)) != 0;
91 +int CPLHaveRuntimeSSE()
97 +/************************************************************************/
98 +/* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE() */
99 +/************************************************************************/
102 +GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(
103 + const void *poOptions,
105 + CPL_UNUSED const double *unused_padfX,
106 + CPL_UNUSED const double *unused_padfY,
107 + CPL_UNUSED const double *unused_padfZ,
108 + double dfXPoint, double dfYPoint,
110 + void* hExtraParamsIn )
113 + GDALGridExtraParameters* psExtraParams = (GDALGridExtraParameters*) hExtraParamsIn;
114 + const float* pafX = psExtraParams->pafX;
115 + const float* pafY = psExtraParams->pafY;
116 + const float* pafZ = psExtraParams->pafZ;
118 + const float fEpsilon = 0.0000000000001f;
119 + const float fXPoint = (float)dfXPoint;
120 + const float fYPoint = (float)dfYPoint;
121 + const __m128 xmm_small = _mm_load1_ps((float*)&fEpsilon);
122 + const __m128 xmm_x = _mm_load1_ps((float*)&fXPoint);
123 + const __m128 xmm_y = _mm_load1_ps((float*)&fYPoint);
124 + __m128 xmm_nominator = _mm_setzero_ps();
125 + __m128 xmm_denominator = _mm_setzero_ps();
128 +#if defined(__x86_64) || defined(_M_X64)
129 + /* Handling two groups of 4 floats per iteration would also work in 32-bit mode, */
130 + /* but only 8 XMM registers are available there, versus 16 in 64-bit mode. */
132 + size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
133 + for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
135 + __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x); /* rx = pafX[i] - fXPoint */
136 + __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x);
137 + __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y); /* ry = pafY[i] - fYPoint */
138 + __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y);
139 + __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */
140 + _mm_mul_ps(xmm_ry, xmm_ry));
141 + __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4),
142 + _mm_mul_ps(xmm_ry_4, xmm_ry_4));
143 + __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */
144 + __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4);
145 + xmm_nominator = _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */
146 + _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
147 + xmm_nominator = _mm_add_ps(xmm_nominator,
148 + _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4)));
149 + xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */
150 + xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4);
151 + mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) | /* if( r2 < fEpsilon) */
152 + (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4);
158 + size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
159 + for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
161 + __m128 xmm_rx = _mm_sub_ps(_mm_load_ps((float*)pafX + i), xmm_x); /* rx = pafX[i] - fXPoint */
162 + __m128 xmm_ry = _mm_sub_ps(_mm_load_ps((float*)pafY + i), xmm_y); /* ry = pafY[i] - fYPoint */
163 + __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */
164 + _mm_mul_ps(xmm_ry, xmm_ry));
165 + __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */
166 + xmm_nominator = _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */
167 + _mm_mul_ps(xmm_invr2, _mm_load_ps((float*)pafZ + i)));
168 + xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */
169 + mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)); /* if( r2 < fEpsilon) */
175 + /* Find which i triggered r2 < fEpsilon */
178 + for(int j = 0; j < LOOP_SIZE; j++ )
180 + if( mask & (1 << j) )
182 + (*pdfValue) = (pafZ)[i + j];
188 + /* Copy the nominator and denominator partial sums back from the XMM registers */
189 + float afNominator[4], afDenominator[4];
190 + _mm_storeu_ps(afNominator, xmm_nominator);
191 + _mm_storeu_ps(afDenominator, xmm_denominator);
193 + float fNominator = afNominator[0] + afNominator[1] +
194 + afNominator[2] + afNominator[3];
195 + float fDenominator = afDenominator[0] + afDenominator[1] +
196 + afDenominator[2] + afDenominator[3];
198 + /* Do the few remaining loop iterations */
199 + for ( ; i < nPoints; i++ )
201 + const float fRX = pafX[i] - fXPoint;
202 + const float fRY = pafY[i] - fYPoint;
204 + fRX * fRX + fRY * fRY;
206 + // If the test point is close to the grid node, use the point
207 + // value directly as a node value to avoid singularity.
208 + if ( fR2 < 0.0000000000001 )
214 + const float fInvR2 = 1.0f / fR2;
215 + fNominator += fInvR2 * pafZ[i];
216 + fDenominator += fInvR2;
222 + (*pdfValue) = pafZ[i];
225 + if ( fDenominator == 0.0 )
228 + ((GDALGridInverseDistanceToAPowerOptions*)poOptions)->dfNoDataValue;
231 + (*pdfValue) = fNominator / fDenominator;
238 --- gdal-1.11.0/alg/gdalgrid.cpp.orig 2014-04-16 22:04:48.000000000 +0200
239 +++ gdal-1.11.0/alg/gdalgrid.cpp 2014-05-11 21:27:49.735840961 +0200
241 #include "cpl_multiproc.h"
242 #include "gdalgrid_priv.h"
244 -#ifdef HAVE_SSE_AT_COMPILE_TIME
245 -#include <xmmintrin.h>
248 CPL_CVSID("$Id: gdalgrid.cpp 27729 2014-09-24 00:40:16Z goatbar $");
250 #define TO_RADIANS (3.14159265358979323846 / 180.0)
254 /************************************************************************/
255 -/* CPLHaveRuntimeSSE() */
256 -/************************************************************************/
258 -#ifdef HAVE_SSE_AT_COMPILE_TIME
260 -#define CPUID_SSE_EDX_BIT 25
262 -#if (defined(_M_X64) || defined(__x86_64))
264 -static int CPLHaveRuntimeSSE()
269 -#elif defined(__GNUC__) && defined(__i386__)
271 -static int CPLHaveRuntimeSSE()
273 - int cpuinfo[4] = {0,0,0,0};
274 - GCC_CPUID(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
275 - return (cpuinfo[3] & (1 << CPUID_SSE_EDX_BIT)) != 0;
278 -#elif defined(_MSC_VER) && defined(_M_IX86)
280 -#if _MSC_VER <= 1310
281 -static void inline __cpuid(int cpuinfo[4], int level)
291 - mov dword ptr [esi], eax
292 - mov dword ptr [esi+4],ebx
293 - mov dword ptr [esi+8],ecx
294 - mov dword ptr [esi+0Ch],edx
304 -static int CPLHaveRuntimeSSE()
306 - int cpuinfo[4] = {0,0,0,0};
307 - __cpuid(cpuinfo, 1);
308 - return (cpuinfo[3] & (1 << CPUID_SSE_EDX_BIT)) != 0;
313 -static int CPLHaveRuntimeSSE()
320 -#endif // HAVE_SSE_AT_COMPILE_TIME
322 -/************************************************************************/
323 /* GDALGridGetPointBounds() */
324 /************************************************************************/
326 @@ -394,148 +322,6 @@
329 /************************************************************************/
330 -/* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE() */
331 -/************************************************************************/
333 -#ifdef HAVE_SSE_AT_COMPILE_TIME
336 -GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(
337 - const void *poOptions,
339 - CPL_UNUSED const double *unused_padfX,
340 - CPL_UNUSED const double *unused_padfY,
341 - CPL_UNUSED const double *unused_padfZ,
342 - double dfXPoint, double dfYPoint,
344 - void* hExtraParamsIn )
347 - GDALGridExtraParameters* psExtraParams = (GDALGridExtraParameters*) hExtraParamsIn;
348 - const float* pafX = psExtraParams->pafX;
349 - const float* pafY = psExtraParams->pafY;
350 - const float* pafZ = psExtraParams->pafZ;
352 - const float fEpsilon = 0.0000000000001f;
353 - const float fXPoint = (float)dfXPoint;
354 - const float fYPoint = (float)dfYPoint;
355 - const __m128 xmm_small = _mm_load1_ps((float*)&fEpsilon);
356 - const __m128 xmm_x = _mm_load1_ps((float*)&fXPoint);
357 - const __m128 xmm_y = _mm_load1_ps((float*)&fYPoint);
358 - __m128 xmm_nominator = _mm_setzero_ps();
359 - __m128 xmm_denominator = _mm_setzero_ps();
362 -#if defined(__x86_64) || defined(_M_X64)
363 - /* This would also work in 32bit mode, but there are only 8 XMM registers */
364 - /* whereas we have 16 for 64bit */
366 - size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
367 - for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
369 - __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x); /* rx = pafX[i] - fXPoint */
370 - __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x);
371 - __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y); /* ry = pafY[i] - fYPoint */
372 - __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y);
373 - __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */
374 - _mm_mul_ps(xmm_ry, xmm_ry));
375 - __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4),
376 - _mm_mul_ps(xmm_ry_4, xmm_ry_4));
377 - __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */
378 - __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4);
379 - xmm_nominator = _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */
380 - _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
381 - xmm_nominator = _mm_add_ps(xmm_nominator,
382 - _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4)));
383 - xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */
384 - xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4);
385 - mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) | /* if( r2 < fEpsilon) */
386 - (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4);
392 - size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
393 - for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
395 - __m128 xmm_rx = _mm_sub_ps(_mm_load_ps((float*)pafX + i), xmm_x); /* rx = pafX[i] - fXPoint */
396 - __m128 xmm_ry = _mm_sub_ps(_mm_load_ps((float*)pafY + i), xmm_y); /* ry = pafY[i] - fYPoint */
397 - __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */
398 - _mm_mul_ps(xmm_ry, xmm_ry));
399 - __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */
400 - xmm_nominator = _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */
401 - _mm_mul_ps(xmm_invr2, _mm_load_ps((float*)pafZ + i)));
402 - xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */
403 - mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)); /* if( r2 < fEpsilon) */
409 - /* Find which i triggered r2 < fEpsilon */
412 - for(int j = 0; j < LOOP_SIZE; j++ )
414 - if( mask & (1 << j) )
416 - (*pdfValue) = (pafZ)[i + j];
422 - /* Get back nominator and denominator values for XMM registers */
423 - float afNominator[4], afDenominator[4];
424 - _mm_storeu_ps(afNominator, xmm_nominator);
425 - _mm_storeu_ps(afDenominator, xmm_denominator);
427 - float fNominator = afNominator[0] + afNominator[1] +
428 - afNominator[2] + afNominator[3];
429 - float fDenominator = afDenominator[0] + afDenominator[1] +
430 - afDenominator[2] + afDenominator[3];
432 - /* Do the few remaining loop iterations */
433 - for ( ; i < nPoints; i++ )
435 - const float fRX = pafX[i] - fXPoint;
436 - const float fRY = pafY[i] - fYPoint;
438 - fRX * fRX + fRY * fRY;
440 - // If the test point is close to the grid node, use the point
441 - // value directly as a node value to avoid singularity.
442 - if ( fR2 < 0.0000000000001 )
448 - const float fInvR2 = 1.0f / fR2;
449 - fNominator += fInvR2 * pafZ[i];
450 - fDenominator += fInvR2;
456 - (*pdfValue) = pafZ[i];
459 - if ( fDenominator == 0.0 )
462 - ((GDALGridInverseDistanceToAPowerOptions*)poOptions)->dfNoDataValue;
465 - (*pdfValue) = fNominator / fDenominator;
469 -#endif // HAVE_SSE_AT_COMPILE_TIME
471 -/************************************************************************/
472 /* GDALGridMovingAverage() */
473 /************************************************************************/
475 --- gdal-1.11.0/alg/GNUmakefile.orig 2014-04-16 22:04:48.000000000 +0200
476 +++ gdal-1.11.0/alg/GNUmakefile 2014-05-11 21:56:55.699137906 +0200
478 CPPFLAGS := -DHAVE_AVX_AT_COMPILE_TIME $(CPPFLAGS)
481 +ifeq ($(HAVE_SSE_AT_COMPILE_TIME),yes)
482 +CPPFLAGS := -DHAVE_SSE_AT_COMPILE_TIME $(CPPFLAGS)
485 ifeq ($(HAVE_GEOS),yes)
486 CPPFLAGS := -DHAVE_GEOS=1 $(GEOS_CFLAGS) $(CPPFLAGS)
490 CPPFLAGS := $(GDAL_INCLUDE) $(CPPFLAGS) $(OPENCL_FLAGS)
492 -default: $(OBJ:.o=.$(OBJ_EXT)) gdalgridavx.$(OBJ_EXT)
493 +default: $(OBJ:.o=.$(OBJ_EXT)) gdalgridavx.$(OBJ_EXT) gdalgridsse.$(OBJ_EXT)
495 gdalgridavx.$(OBJ_EXT): gdalgridavx.cpp
496 $(CXX) $(CXXFLAGS) $(AVXFLAGS) $(CPPFLAGS) -c -o $@ $<
498 +gdalgridsse.$(OBJ_EXT): gdalgridsse.cpp
499 + $(CXX) $(CXXFLAGS) $(SSEFLAGS) $(CPPFLAGS) -c -o $@ $<
504 --- gdal-1.11.0/configure.in.orig 2014-05-11 20:11:46.272602746 +0200
505 +++ gdal-1.11.0/configure.in 2014-05-11 22:00:20.125800312 +0200
506 @@ -240,12 +240,12 @@
507 echo '#endif' >> detectsse.cpp
508 if test -z "`${CXX} ${CXXFLAGS} -o detectsse detectsse.cpp 2>&1`" ; then
510 - SSEFLAGS="-DHAVE_SSE_AT_COMPILE_TIME"
512 HAVE_SSE_AT_COMPILE_TIME=yes
514 if test -z "`${CXX} ${CXXFLAGS} -msse -o detectsse detectsse.cpp 2>&1`" ; then
516 - SSEFLAGS="-msse -DHAVE_SSE_AT_COMPILE_TIME"
518 HAVE_SSE_AT_COMPILE_TIME=yes
521 @@ -279,16 +279,14 @@
525 - if test "$HAVE_SSE_AT_COMPILE_TIME" = "yes"; then
526 - CFLAGS="$CFLAGS $SSEFLAGS"
527 - CXXFLAGS="$CXXFLAGS $SSEFLAGS"
535 +AC_SUBST(SSEFLAGS,$SSEFLAGS)
536 +AC_SUBST(HAVE_SSE_AT_COMPILE_TIME,$HAVE_SSE_AT_COMPILE_TIME)
538 dnl ---------------------------------------------------------------------------
539 dnl Check AVX availability
540 dnl ---------------------------------------------------------------------------
541 --- gdal-1.11.0/GDALmake.opt.in.orig 2014-05-12 19:27:07.164191074 +0200
542 +++ gdal-1.11.0/GDALmake.opt.in 2014-05-12 20:39:04.850767745 +0200
544 $(PCIDSK_LIB) $(RASDAMAN_LIB) $(CHARLS_LIB) $(SOSI_LIB) \
545 $(OPENCL_LIB) $(JVM_LIB) $(LIBICONV) $(FGDB_LIB) $(LIBXML2_LIB)
547 +SSEFLAGS = @SSEFLAGS@
548 +HAVE_SSE_AT_COMPILE_TIME = @HAVE_SSE_AT_COMPILE_TIME@
549 AVXFLAGS = @AVXFLAGS@
550 HAVE_AVX_AT_COMPILE_TIME = @HAVE_AVX_AT_COMPILE_TIME@