--- gdal-1.11.0/alg/gdalgrid_priv.h.orig 2014-04-16 22:04:48.000000000 +0200 +++ gdal-1.11.0/alg/gdalgrid_priv.h 2014-05-11 20:50:49.579220569 +0200 @@ -51,6 +51,21 @@ const float *pafZ; } GDALGridExtraParameters; +#ifdef HAVE_SSE_AT_COMPILE_TIME +int CPLHaveRuntimeSSE(); + +CPLErr +GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE( + const void *poOptions, + GUInt32 nPoints, + const double *unused_padfX, + const double *unused_padfY, + const double *unused_padfZ, + double dfXPoint, double dfYPoint, + double *pdfValue, + void* hExtraParamsIn ); +#endif + #ifdef HAVE_AVX_AT_COMPILE_TIME int CPLHaveRuntimeAVX(); --- gdal-1.11.0/alg/gdalgridsse.cpp.orig 1970-01-01 01:00:00.000000000 +0100 +++ gdal-1.11.0/alg/gdalgridsse.cpp 2014-05-11 21:54:46.609140595 +0200 @@ -0,0 +1,210 @@ +#include "gdalgrid.h" +#include "gdalgrid_priv.h" + +#ifdef HAVE_SSE_AT_COMPILE_TIME +#include + +/************************************************************************/ +/* CPLHaveRuntimeSSE() */ +/************************************************************************/ + +#define CPUID_SSE_EDX_BIT 25 + +#if (defined(_M_X64) || defined(__x86_64)) + +int CPLHaveRuntimeSSE() +{ + return TRUE; +} + +#elif defined(__GNUC__) && defined(__i386__) + +int CPLHaveRuntimeSSE() +{ + int cpuinfo[4] = {0,0,0,0}; + GCC_CPUID(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); + return (cpuinfo[3] & (1 << CPUID_SSE_EDX_BIT)) != 0; +} + +#elif defined(_MSC_VER) && defined(_M_IX86) + +#if _MSC_VER <= 1310 +static void inline __cpuid(int cpuinfo[4], int level) +{ + __asm + { + push ebx + push esi + + mov esi,cpuinfo + mov eax,level + cpuid + mov dword ptr [esi], eax + mov dword ptr [esi+4],ebx + mov dword ptr [esi+8],ecx + mov dword ptr [esi+0Ch],edx + + pop esi + pop ebx + } +} +#else +#include +#endif + +int CPLHaveRuntimeSSE() +{ + int cpuinfo[4] = {0,0,0,0}; + __cpuid(cpuinfo, 1); + return (cpuinfo[3] & (1 << CPUID_SSE_EDX_BIT)) != 0; +} + +#else + +int CPLHaveRuntimeSSE() +{ + return FALSE; +} +#endif + +/************************************************************************/ +/* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE() */ +/************************************************************************/ + +CPLErr +GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE( + const void *poOptions, + GUInt32 nPoints, + const double *unused_padfX, + const double *unused_padfY, + const double *unused_padfZ, + double dfXPoint, double dfYPoint, + double *pdfValue, + void* hExtraParamsIn ) +{ + size_t i = 0; + GDALGridExtraParameters* psExtraParams = (GDALGridExtraParameters*) hExtraParamsIn; + const float* pafX = psExtraParams->pafX; + const float* pafY = psExtraParams->pafY; + const float* pafZ = psExtraParams->pafZ; + + const float fEpsilon = 0.0000000000001f; + const float fXPoint = (float)dfXPoint; + const float fYPoint = (float)dfYPoint; + const __m128 xmm_small = _mm_load1_ps((float*)&fEpsilon); + const __m128 xmm_x = _mm_load1_ps((float*)&fXPoint); + const __m128 xmm_y = _mm_load1_ps((float*)&fYPoint); + __m128 xmm_nominator = _mm_setzero_ps(); + __m128 xmm_denominator = _mm_setzero_ps(); + int mask = 0; + +#if defined(__x86_64) || defined(_M_X64) + /* This would also work in 32bit mode, but there are only 8 XMM registers */ + /* whereas we have 16 for 64bit */ +#define LOOP_SIZE 8 + size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; + for ( i = 0; i < nPointsRound; i += LOOP_SIZE ) + { + __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x); /* rx = pafX[i] - fXPoint */ + __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x); + __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y); /* ry = pafY[i] - fYPoint */ + __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y); + __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */ + _mm_mul_ps(xmm_ry, xmm_ry)); + __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4), + _mm_mul_ps(xmm_ry_4, xmm_ry_4)); + __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */ + __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4); + xmm_nominator = _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */ + _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i))); + xmm_nominator = _mm_add_ps(xmm_nominator, + _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4))); + xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */ + xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4); + mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) | /* if( r2 < fEpsilon) */ + (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4); + if( mask ) + break; + } +#else +#define LOOP_SIZE 4 + size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; + for ( i = 0; i < nPointsRound; i += LOOP_SIZE ) + { + __m128 xmm_rx = _mm_sub_ps(_mm_load_ps((float*)pafX + i), xmm_x); /* rx = pafX[i] - fXPoint */ + __m128 xmm_ry = _mm_sub_ps(_mm_load_ps((float*)pafY + i), xmm_y); /* ry = pafY[i] - fYPoint */ + __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */ + _mm_mul_ps(xmm_ry, xmm_ry)); + __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */ + xmm_nominator = _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */ + _mm_mul_ps(xmm_invr2, _mm_load_ps((float*)pafZ + i))); + xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */ + mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)); /* if( r2 < fEpsilon) */ + if( mask ) + break; + } +#endif + + /* Find which i triggered r2 < fEpsilon */ + if( mask ) + { + for(int j = 0; j < LOOP_SIZE; j++ ) + { + if( mask & (1 << j) ) + { + (*pdfValue) = (pafZ)[i + j]; + return CE_None; + } + } + } + + /* Get back nominator and denominator values for XMM registers */ + float afNominator[4], afDenominator[4]; + _mm_storeu_ps(afNominator, xmm_nominator); + _mm_storeu_ps(afDenominator, xmm_denominator); + + float fNominator = afNominator[0] + afNominator[1] + + afNominator[2] + afNominator[3]; + float fDenominator = afDenominator[0] + afDenominator[1] + + afDenominator[2] + afDenominator[3]; + + /* Do the few remaining loop iterations */ + for ( ; i < nPoints; i++ ) + { + const float fRX = pafX[i] - fXPoint; + const float fRY = pafY[i] - fYPoint; + const float fR2 = + fRX * fRX + fRY * fRY; + + // If the test point is close to the grid node, use the point + // value directly as a node value to avoid singularity. + if ( fR2 < 0.0000000000001 ) + { + break; + } + else + { + const float fInvR2 = 1.0f / fR2; + fNominator += fInvR2 * pafZ[i]; + fDenominator += fInvR2; + } + } + + if( i != nPoints ) + { + (*pdfValue) = pafZ[i]; + } + else + if ( fDenominator == 0.0 ) + { + (*pdfValue) = + ((GDALGridInverseDistanceToAPowerOptions*)poOptions)->dfNoDataValue; + } + else + (*pdfValue) = fNominator / fDenominator; + + return CE_None; +} + + +#endif --- gdal-1.11.0/alg/gdalgrid.cpp.orig 2014-04-16 22:04:48.000000000 +0200 +++ gdal-1.11.0/alg/gdalgrid.cpp 2014-05-11 21:27:49.735840961 +0200 @@ -36,10 +36,6 @@ #include "cpl_multiproc.h" #include "gdalgrid_priv.h" -#ifdef HAVE_SSE_AT_COMPILE_TIME -#include -#endif - CPL_CVSID("$Id: gdalgrid.cpp 27110 2014-03-28 21:29:20Z rouault $"); #define TO_RADIANS (3.14159265358979323846 / 180.0) @@ -53,74 +49,6 @@ #endif /* DBL_MAX */ /************************************************************************/ -/* CPLHaveRuntimeSSE() */ -/************************************************************************/ - -#ifdef HAVE_SSE_AT_COMPILE_TIME - -#define CPUID_SSE_EDX_BIT 25 - -#if (defined(_M_X64) || defined(__x86_64)) - -static int CPLHaveRuntimeSSE() -{ - return TRUE; -} - -#elif defined(__GNUC__) && defined(__i386__) - -static int CPLHaveRuntimeSSE() -{ - int cpuinfo[4] = {0,0,0,0}; - GCC_CPUID(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); - return (cpuinfo[3] & (1 << CPUID_SSE_EDX_BIT)) != 0; -} - -#elif defined(_MSC_VER) && defined(_M_IX86) - -#if _MSC_VER <= 1310 -static void inline __cpuid(int cpuinfo[4], int level) -{ - __asm - { - push ebx - push esi - - mov esi,cpuinfo - mov eax,level - cpuid - mov dword ptr [esi], eax - mov dword ptr [esi+4],ebx - mov dword ptr [esi+8],ecx - mov dword ptr [esi+0Ch],edx - - pop esi - pop ebx - } -} -#else -#include -#endif - -static int CPLHaveRuntimeSSE() -{ - int cpuinfo[4] = {0,0,0,0}; - __cpuid(cpuinfo, 1); - return (cpuinfo[3] & (1 << CPUID_SSE_EDX_BIT)) != 0; -} - -#else - -static int CPLHaveRuntimeSSE() -{ - return FALSE; -} - -#endif - -#endif // HAVE_SSE_AT_COMPILE_TIME - -/************************************************************************/ /* GDALGridGetPointBounds() */ /************************************************************************/ @@ -394,148 +322,6 @@ } /************************************************************************/ -/* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE() */ -/************************************************************************/ - -#ifdef HAVE_SSE_AT_COMPILE_TIME - -static CPLErr -GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE( - const void *poOptions, - GUInt32 nPoints, - const double *unused_padfX, - const double *unused_padfY, - const double *unused_padfZ, - double dfXPoint, double dfYPoint, - double *pdfValue, - void* hExtraParamsIn ) -{ - size_t i = 0; - GDALGridExtraParameters* psExtraParams = (GDALGridExtraParameters*) hExtraParamsIn; - const float* pafX = psExtraParams->pafX; - const float* pafY = psExtraParams->pafY; - const float* pafZ = psExtraParams->pafZ; - - const float fEpsilon = 0.0000000000001f; - const float fXPoint = (float)dfXPoint; - const float fYPoint = (float)dfYPoint; - const __m128 xmm_small = _mm_load1_ps((float*)&fEpsilon); - const __m128 xmm_x = _mm_load1_ps((float*)&fXPoint); - const __m128 xmm_y = _mm_load1_ps((float*)&fYPoint); - __m128 xmm_nominator = _mm_setzero_ps(); - __m128 xmm_denominator = _mm_setzero_ps(); - int mask = 0; - -#if defined(__x86_64) || defined(_M_X64) - /* This would also work in 32bit mode, but there are only 8 XMM registers */ - /* whereas we have 16 for 64bit */ -#define LOOP_SIZE 8 - size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; - for ( i = 0; i < nPointsRound; i += LOOP_SIZE ) - { - __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x); /* rx = pafX[i] - fXPoint */ - __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x); - __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y); /* ry = pafY[i] - fYPoint */ - __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y); - __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */ - _mm_mul_ps(xmm_ry, xmm_ry)); - __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4), - _mm_mul_ps(xmm_ry_4, xmm_ry_4)); - __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */ - __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4); - xmm_nominator = _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */ - _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i))); - xmm_nominator = _mm_add_ps(xmm_nominator, - _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4))); - xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */ - xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4); - mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) | /* if( r2 < fEpsilon) */ - (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4); - if( mask ) - break; - } -#else -#define LOOP_SIZE 4 - size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; - for ( i = 0; i < nPointsRound; i += LOOP_SIZE ) - { - __m128 xmm_rx = _mm_sub_ps(_mm_load_ps((float*)pafX + i), xmm_x); /* rx = pafX[i] - fXPoint */ - __m128 xmm_ry = _mm_sub_ps(_mm_load_ps((float*)pafY + i), xmm_y); /* ry = pafY[i] - fYPoint */ - __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx), /* r2 = rx * rx + ry * ry */ - _mm_mul_ps(xmm_ry, xmm_ry)); - __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2); /* invr2 = 1.0f / r2 */ - xmm_nominator = _mm_add_ps(xmm_nominator, /* nominator += invr2 * pafZ[i] */ - _mm_mul_ps(xmm_invr2, _mm_load_ps((float*)pafZ + i))); - xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2); /* denominator += invr2 */ - mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)); /* if( r2 < fEpsilon) */ - if( mask ) - break; - } -#endif - - /* Find which i triggered r2 < fEpsilon */ - if( mask ) - { - for(int j = 0; j < LOOP_SIZE; j++ ) - { - if( mask & (1 << j) ) - { - (*pdfValue) = (pafZ)[i + j]; - return CE_None; - } - } - } - - /* Get back nominator and denominator values for XMM registers */ - float afNominator[4], afDenominator[4]; - _mm_storeu_ps(afNominator, xmm_nominator); - _mm_storeu_ps(afDenominator, xmm_denominator); - - float fNominator = afNominator[0] + afNominator[1] + - afNominator[2] + afNominator[3]; - float fDenominator = afDenominator[0] + afDenominator[1] + - afDenominator[2] + afDenominator[3]; - - /* Do the few remaining loop iterations */ - for ( ; i < nPoints; i++ ) - { - const float fRX = pafX[i] - fXPoint; - const float fRY = pafY[i] - fYPoint; - const float fR2 = - fRX * fRX + fRY * fRY; - - // If the test point is close to the grid node, use the point - // value directly as a node value to avoid singularity. - if ( fR2 < 0.0000000000001 ) - { - break; - } - else - { - const float fInvR2 = 1.0f / fR2; - fNominator += fInvR2 * pafZ[i]; - fDenominator += fInvR2; - } - } - - if( i != nPoints ) - { - (*pdfValue) = pafZ[i]; - } - else - if ( fDenominator == 0.0 ) - { - (*pdfValue) = - ((GDALGridInverseDistanceToAPowerOptions*)poOptions)->dfNoDataValue; - } - else - (*pdfValue) = fNominator / fDenominator; - - return CE_None; -} -#endif // HAVE_SSE_AT_COMPILE_TIME - -/************************************************************************/ /* GDALGridMovingAverage() */ /************************************************************************/ @@ -1766,7 +1552,9 @@ pabyX = pabyY = pabyZ = NULL; } } +# ifdef HAVE_SSE_AT_COMPILE_TIME else +# endif #endif #ifdef HAVE_SSE_AT_COMPILE_TIME --- gdal-1.11.0/alg/GNUmakefile.orig 2014-04-16 22:04:48.000000000 +0200 +++ gdal-1.11.0/alg/GNUmakefile 2014-05-11 21:56:55.699137906 +0200 @@ -16,6 +16,10 @@ CPPFLAGS := -DHAVE_AVX_AT_COMPILE_TIME $(CPPFLAGS) endif +ifeq ($(HAVE_SSE_AT_COMPILE_TIME),yes) +CPPFLAGS := -DHAVE_SSE_AT_COMPILE_TIME $(CPPFLAGS) +endif + ifeq ($(HAVE_GEOS),yes) CPPFLAGS := -DHAVE_GEOS=1 $(GEOS_CFLAGS) $(CPPFLAGS) endif @@ -26,11 +30,14 @@ CPPFLAGS := $(GDAL_INCLUDE) $(CPPFLAGS) $(OPENCL_FLAGS) -default: $(OBJ:.o=.$(OBJ_EXT)) gdalgridavx.$(OBJ_EXT) +default: $(OBJ:.o=.$(OBJ_EXT)) gdalgridavx.$(OBJ_EXT) gdalgridsse.$(OBJ_EXT) gdalgridavx.$(OBJ_EXT): gdalgridavx.cpp $(CXX) $(CXXFLAGS) $(AVXFLAGS) $(CPPFLAGS) -c -o $@ $< +gdalgridsse.$(OBJ_EXT): gdalgridsse.cpp + $(CXX) $(CXXFLAGS) $(SSEFLAGS) $(CPPFLAGS) -c -o $@ $< + clean: $(RM) *.o $(O_OBJ) --- gdal-1.11.0/configure.in.orig 2014-05-11 20:11:46.272602746 +0200 +++ gdal-1.11.0/configure.in 2014-05-11 22:00:20.125800312 +0200 @@ -240,12 +240,12 @@ echo '#endif' >> detectsse.cpp if test -z "`${CXX} ${CXXFLAGS} -o detectsse detectsse.cpp 2>&1`" ; then AC_MSG_RESULT([yes]) - SSEFLAGS="-DHAVE_SSE_AT_COMPILE_TIME" + SSEFLAGS="" HAVE_SSE_AT_COMPILE_TIME=yes else if test -z "`${CXX} ${CXXFLAGS} -msse -o detectsse detectsse.cpp 2>&1`" ; then AC_MSG_RESULT([yes]) - SSEFLAGS="-msse -DHAVE_SSE_AT_COMPILE_TIME" + SSEFLAGS="-msse" HAVE_SSE_AT_COMPILE_TIME=yes else AC_MSG_RESULT([no]) @@ -279,16 +279,14 @@ esac fi - if test "$HAVE_SSE_AT_COMPILE_TIME" = "yes"; then - CFLAGS="$CFLAGS $SSEFLAGS" - CXXFLAGS="$CXXFLAGS $SSEFLAGS" - fi - rm -f detectsse* else AC_MSG_RESULT([no]) fi +AC_SUBST(SSEFLAGS,$SSEFLAGS) +AC_SUBST(HAVE_SSE_AT_COMPILE_TIME,$HAVE_SSE_AT_COMPILE_TIME) + dnl --------------------------------------------------------------------------- dnl Check AVX availability dnl --------------------------------------------------------------------------- --- gdal-1.11.0/GDALmake.opt.in.orig 2014-05-12 19:27:07.164191074 +0200 +++ gdal-1.11.0/GDALmake.opt.in 2014-05-12 20:39:04.850767745 +0200 @@ -37,6 +37,8 @@ $(PCIDSK_LIB) $(RASDAMAN_LIB) $(CHARLS_LIB) $(SOSI_LIB) \ $(OPENCL_LIB) $(JVM_LIB) $(LIBICONV) $(FGDB_LIB) $(LIBXML2_LIB) +SSEFLAGS = @SSEFLAGS@ +HAVE_SSE_AT_COMPILE_TIME = @HAVE_SSE_AT_COMPILE_TIME@ AVXFLAGS = @AVXFLAGS@ HAVE_AVX_AT_COMPILE_TIME = @HAVE_AVX_AT_COMPILE_TIME@