当前位置: 首页 > 编程日记 > 正文

SSE4.1和SSE4.2 Intrinsics各函数介绍

SIMD相关头文件包括:

//#include <ivec.h>//MMX
//#include <fvec.h>//SSE(also include ivec.h)
//#include <dvec.h>//SSE2(also include fvec.h)
#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h>//SSSE3(include pmmintrin.h)
#include <smmintrin.h>//SSE4.1(include tmmintrin.h)
#include <nmmintrin.h>//SSE4.2(include smmintrin.h)
#include <wmmintrin.h>//AES(include nmmintrin.h)
#include <immintrin.h>//AVX(include wmmintrin.h)
#include <intrin.h>//(include immintrin.h)

mmintrin.h为MMX 头文件,其中__m64的定义为:

/*
 * __m64 as quoted from the MMX header mmintrin.h (MSVC dialect).
 * A union declared with _CRT_ALIGN(8); each member is an alternative
 * view (whole value, or an array of lanes) of the same storage.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
{
    unsigned __int64    m64_u64;    /* whole value, unsigned */
    float               m64_f32[2]; /* two float lanes */
    __int8              m64_i8[8];  /* eight signed byte lanes */
    __int16             m64_i16[4]; /* four signed 16-bit lanes */
    __int32             m64_i32[2]; /* two signed 32-bit lanes */
    __int64             m64_i64;    /* whole value, signed */
    unsigned __int8     m64_u8[8];  /* eight unsigned byte lanes */
    unsigned __int16    m64_u16[4]; /* four unsigned 16-bit lanes */
    unsigned __int32    m64_u32[2]; /* two unsigned 32-bit lanes */
} __m64;

xmmintrin.h为SSE 头文件,此头文件里包含MMX头文件,其中__m128的定义为:

/*
 * __m128 as quoted from the SSE header xmmintrin.h (MSVC dialect).
 * A union declared with _CRT_ALIGN(16); each member is an alternative
 * lane view of the same storage.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128
{
    float               m128_f32[4];  /* four float lanes */
    unsigned __int64    m128_u64[2];  /* two unsigned 64-bit lanes */
    __int8              m128_i8[16];  /* sixteen signed byte lanes */
    __int16             m128_i16[8];  /* eight signed 16-bit lanes */
    __int32             m128_i32[4];  /* four signed 32-bit lanes */
    __int64             m128_i64[2];  /* two signed 64-bit lanes */
    unsigned __int8     m128_u8[16];  /* sixteen unsigned byte lanes */
    unsigned __int16    m128_u16[8];  /* eight unsigned 16-bit lanes */
    unsigned __int32    m128_u32[4];  /* four unsigned 32-bit lanes */
} __m128;

emmintrin.h为SSE2头文件,此头文件里包含SSE头文件,其中__m128i和__m128d的定义为:

/*
 * __m128i as quoted from the SSE2 header emmintrin.h (MSVC dialect).
 * A union declared with _CRT_ALIGN(16); each member is an alternative
 * integer lane view of the same storage.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i
{
    __int8              m128i_i8[16];  /* sixteen signed byte lanes */
    __int16             m128i_i16[8];  /* eight signed 16-bit lanes */
    __int32             m128i_i32[4];  /* four signed 32-bit lanes */
    __int64             m128i_i64[2];  /* two signed 64-bit lanes */
    unsigned __int8     m128i_u8[16];  /* sixteen unsigned byte lanes */
    unsigned __int16    m128i_u16[8];  /* eight unsigned 16-bit lanes */
    unsigned __int32    m128i_u32[4];  /* four unsigned 32-bit lanes */
    unsigned __int64    m128i_u64[2];  /* two unsigned 64-bit lanes */
} __m128i;

/*
 * __m128d as quoted from the SSE2 header emmintrin.h (MSVC dialect).
 * A struct declared with _CRT_ALIGN(16) holding two double lanes.
 */
typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d
{
    double              m128d_f64[2];
} __m128d;

smmintrin.h为SSE4.1头文件,其文件中各函数的介绍:

	/*Integer blend instructions - select data from 2 sourcesusing constant/variable mask*///v1=(v10, v11, ..., v17), v2=(v20, v21, ..., v27)//mask:If the corresponding flag bit is 0, the value is selected from parameter v1.//Otherwise the value is from parameter v2.//则r0=(mask0 == 0) ? v10 : v20,...,r7= (mask7 == 0) ? v17 : v27extern __m128i _mm_blend_epi16 (__m128i v1, __m128i v2, const int mask);//v1=(v10, v11, ..., v115), v2=(v20, v21, ..., v215), mask=(mask1, ..., mask15)//则r0=(mask0 & 0x80) ? v20 : v10, ..., r15=(mask15 & 0x80) ? v215 : v115extern __m128i _mm_blendv_epi8 (__m128i v1, __m128i v2, __m128i mask);/*Float single precision blend instructions - select datafrom 2 sources using constant/variable mask *///v1=(v10, v11, v12, v13), v2=(v20, v21, v22, v23)//则r0=(mask0 == 0) ? v10 : v20,..., r3= (mask3 == 0) ? v13 : v23extern __m128  _mm_blend_ps (__m128  v1, __m128  v2, const int mask);//v1=(v10, v11, v12, v13), v2=(v20, v21, v22, v23)//则r0= (v30 & 0x80000000) ? v20 : v10,...,r3= (v33 & 0x80000000) ? v23 : v13extern __m128  _mm_blendv_ps(__m128  v1, __m128  v2, __m128 v3);/*Float double precision blend instructions - select datafrom 2 sources using constant/variable mask*///v1=(v10, v11), v2=(v20, v21)//则r0 = (mask0 == 0) ? v10 : v20, r1 = (mask1 == 0) ? v11 : v21extern __m128d _mm_blend_pd (__m128d v1, __m128d v2, const int mask);//v1=(v10, v11), v2=(v20, v21)//则r0 = (v30 & 0x8000000000000000) ? v20 : v10,//r1 = (v31 & 0x8000000000000000) ? v21 : v11extern __m128d _mm_blendv_pd(__m128d v1, __m128d v2, __m128d v3);/*Dot product instructions with mask-defined summing and zeroingof result's parts*///val1=(val10, ..., val13), val2=(val20,...,val23)/*则tmp0 := (mask4 == 1) ? (val10 * val20) : +0.0tmp1 := (mask5 == 1) ? (val11 * val21) : +0.0tmp2 := (mask6 == 1) ? (val12 * val22) : +0.0tmp3 := (mask7 == 1) ? (val13 * val23) : +0.0tmp4 := tmp0 + tmp1 + tmp2 + tmp3r0 := (mask0 == 1) ? tmp4 : +0.0r1 := (mask1 == 1) ? tmp4 : +0.0r2 := (mask2 == 1) ? 
tmp4 : +0.0r3 := (mask3 == 1) ? tmp4 : +0.0 */extern __m128  _mm_dp_ps(__m128  val1, __m128  val2, const int mask);//val1=(val10, val11), val2=(val20, val21)/*则tmp0 := (mask4 == 1) ? (val10 * val20) : +0.0tmp1 := (mask5 == 1) ? (val11 * val21) : +0.0tmp2 := tmp0 + tmp1r0 := (mask0 == 1) ? tmp2 : +0.0r1 := (mask1 == 1) ? tmp2 : +0.0 */extern __m128d _mm_dp_pd(__m128d val1, __m128d val2, const int mask);/*Packed integer 64-bit comparison, zeroing or filling with onescorresponding parts of result *///val1=(val10, val11), val2=(val20, val21)//则r0 = (val10 == val20) ? 0xffffffffffffffff : 0,//r1 = (val11 == val21) ? 0xffffffffffffffff : 0extern __m128i _mm_cmpeq_epi64(__m128i val1, __m128i val2);/* Min/max packed integer instructions*///val1=(val10,...,val115), val2=(val20,...,val215)//则r0 = (val10 < val20) ? val10 : val20, ...,//r15 = (val115 < val215) ? val115 : val215extern __m128i _mm_min_epi8 (__m128i val1, __m128i val2);//val1=(val10,...,val115), val2=(val20,...,val215)//则r0 = (val10 > val20) ? val10 : val20, ...,//r15 = (val115 > val215) ? val115 : val215extern __m128i _mm_max_epi8 (__m128i val1, __m128i val2);//val1=(val10,...,val17), val2=(val20,...,val27), eight 16-bit unsigned integers//则r0 = (val10 < val20) ? val10 : val20, ...,//r7 = (val17 < val27) ? val17 : val27extern __m128i _mm_min_epu16(__m128i val1, __m128i val2);//val1=(val10,...,val17), val2=(val20,...,val27),eight 16-bit unsigned integers//则r0 = (val10 > val20) ? val10 : val20, ...,//r7 = (val17 > val27) ? val17 : val27extern __m128i _mm_max_epu16(__m128i val1, __m128i val2);//val1=(val10,...,val13), val2=(val20,...,val23)//则r0 = (val10 < val20) ? val10 : val20, ...,//r3 = (val13 < val23) ? val13 : val23extern __m128i _mm_min_epi32(__m128i val1, __m128i val2);//val1=(val10,...,val13), val2=(val20,...,val23)//则r0 = (val10 > val20) ? val10 : val20, ...,//r3 = (val13 > val23) ? 
val13 : val23extern __m128i _mm_max_epi32(__m128i val1, __m128i val2);//val1=(val10,...,val13), val2=(val20,...,val23), four 32-bit unsigned integers//则r0 = (val10 < val20) ? val10 : val20, ...,//r3 = (val13 < val23) ? val13 : val23extern __m128i _mm_min_epu32(__m128i val1, __m128i val2);//val1=(val10,...,val13), val2=(val20,...,val23), four 32-bit unsigned integers//则r0 = (val10 > val20) ? val10 : val20, ...,//r3 = (val13 > val23) ? val13 : val23extern __m128i _mm_max_epu32(__m128i val1, __m128i val2);/*Packed integer 32-bit multiplication with truncationof upper halves of results*///a=(a0,...,a3), b=(b0,...,b3), 则r0=a0 * b0, ..., r3=a3 * b3//Only the lower 32-bits of each product are savedextern __m128i _mm_mullo_epi32(__m128i a, __m128i b);/*Packed integer 32-bit multiplication of 2 pairs of operandsproducing two 64-bit results *///a=(a0,a1,a2,a3), b=(b0,b1,b2,b3)//r0=low_half(a0*b0), r1=high_half(a0*b0),r2=low_half(a2*b2), r3=high_half(a2*b2)//The upper 32-bits of each quadword of the input parameters are not usedextern __m128i _mm_mul_epi32(__m128i a, __m128i b);/*Packed integer 128-bit bitwise comparison.return 1 if (val 'and' mask) == 0*///则r = (mask & val) == 0, Generates a return value of 0 or 1extern int _mm_testz_si128(__m128i mask, __m128i val);/*Packed integer 128-bit bitwise comparison.return 1 if (val 'and_not' mask) == 0 *///则r=1 if all the bits set in val are set in mask; otherwise 0//Generates a return value of 0 or 1extern int _mm_testc_si128(__m128i mask, __m128i val);/*Packed integer 128-bit bitwise comparisonZF = ((val 'and' mask) == 0)  CF = ((val 'and_not' mask) == 0)return 1 if both ZF and CF are 0 *///则 ZF := (mask & s2) == 0,CF := (~mask & s2) == 0, r = ~ZF & ~CF//Generates a return value of 0 or 1extern int _mm_testnzc_si128(__m128i mask, __m128i s2);/*Insert single precision float into packed single precisionarray element selected by index.The bits [7-6] of the 3d parameter define src index,the bits [5-4] define dst index, and bits 
[3-0] define zeroingmask for dst *//*	sx := ndx6-7sval := (sx == 0) ? src0 : ((sx == 1) ? src1 : ((sx == 2) ? src2 : src3))dx := ndx4-5r0 := (dx == 0) ? sval : dst0r1 := (dx == 1) ? sval : dst1r2 := (dx == 2) ? sval : dst2r3 := (dx == 3) ? sval : dst3zmask := ndx0-3r0 := (zmask0 == 1) ? +0.0 : r0r1 := (zmask1 == 1) ? +0.0 : r1r2 := (zmask2 == 1) ? +0.0 : r2r3 := (zmask3 == 1) ? +0.0 : r3 */extern __m128 _mm_insert_ps(__m128 dst, __m128 src, const int ndx);/*Extract binary representation of single precision float frompacked single precision array element selected by index *///src=(src0, src1, src2, src3)//则r = (ndx == 0) ? src0 : ((ndx == 1) ? src1 : ((ndx == 2) ? src2 : src3))//Only the least significant two bits of ndx are usedextern int _mm_extract_ps(__m128 src, const int ndx);/*Insert integer into packed integer array elementselected by index *///则r0=(ndx == 0) ? s : dst0, ..., r15=(ndx == 15) ? s : dst15//Only the lowest 8 bits of s are used, //Only the least significant 4 bits of ndx are usedextern __m128i _mm_insert_epi8 (__m128i dst, int s, const int ndx);//则r0=(ndx == 0) ? s : dst0, ..., r3=(ndx == 3) ? s : dst3//Only the least significant 2 bits of ndx are interpretedextern __m128i _mm_insert_epi32(__m128i dst, int s, const int ndx);//则r0=(ndx == 0) ? s : dst0, r1=(ndx == 1) ? s : dst1//Only the least significant bit of ndx is interpretedextern __m128i _mm_insert_epi64(__m128i dst, __int64 s, const int ndx);/*Extract integer from packed integer array elementselected by index *///则r=(ndx == 0) ? src0 : ((ndx == 1) ? src1 : ...((ndx == 14) ? src14 : src15))//Only the least significant four bits of ndx are used//注意:The result is the unsigned equivalent of the appropriate 8-bits in parameter srcextern int _mm_extract_epi8 (__m128i src, const int ndx);//则r=(ndx == 0) ? src0 : ((ndx == 1) ? src1 : ((ndx == 2) ? src2 : src3))//Only the least significant two bits of ndx are used.extern int _mm_extract_epi32(__m128i src, const int ndx);//则r = (ndx == 0) ? 
src0 : src1//Only the least significant bit of parameter ndx is usedextern __int64 _mm_extract_epi64(__m128i src, const int ndx);/*Horizontal packed word minimum and its index inresult[15:0] and result[18:16] respectively *///The lowest order 16 bits are the minimum value found in parameter shortValues.//The second-lowest order 16 bits are the index of the minimum value //found in parameter shortValues.extern __m128i _mm_minpos_epu16(__m128i shortValues);/* Packed/single float double precision rounding *///则r0=RND(val0), r1=RND(val1),详见参考文献1extern __m128d _mm_round_pd(__m128d val, int iRoundMode);//则r0=RND(val0), r1=dst1, 详见参考文献1// The lowest 64 bits are the result of the rounding function on val.//The higher order 64 bits are copied directly from input parameter dstextern __m128d _mm_round_sd(__m128d dst, __m128d val, int iRoundMode);/*Packed/single float single precision rounding *///则r0=RND(val0), r1=RND(val1), r2=RND(val2), r3=RND(val3),详见参考文献1extern __m128  _mm_round_ps(__m128  val, int iRoundMode);//则r0=RND(val0), r1=dst1, r2=dst2, r3=dst3, 	//The lowest 32 bits are the result of the rounding function on val.//The higher order 96 bits are copied directly from input parameter dstextern __m128  _mm_round_ss(__m128 dst, __m128  val, int iRoundMode);/*Packed integer sign-extension *///byteValues: A 128-bit parameter that contains four signed 8-bit integers//in the lower 32 bits, byteValues=(a0, a1, ..., a15)/*则r0 := a0r1 := (a0 < 0) ? 0xff : 0r2 := (a0 < 0) ? 0xff : 0r3 := (a0 < 0) ? 0xff : 0r4 := a1r5 := (a1 < 0) ? 0xff : 0r6 := (a1 < 0) ? 0xff : 0r7 := (a1 < 0) ? 0xff : 0r8 := a2r9 := (a2 < 0) ? 0xff : 0r10 := (a2 < 0) ? 0xff : 0r11 := (a2 < 0) ? 0xff : 0r12 := a3r13 := (a3 < 0) ? 0xff : 0r14 := (a3 < 0) ? 0xff : 0r15 := (a3 < 0) ? 0xff : 0 */extern __m128i _mm_cvtepi8_epi32 (__m128i byteValues);//shortValues: A 128-bit parameter that contains four signed 16-bit integers//in the lower 64 bits, shortValues=(a0, a1, ..., a7)/*则r0 := a0r1 := (a0 < 0) ? 
0xffff : 0r2 := a1r3 := (a1 < 0) ? 0xffff : 0r4 := a2r5 := (a2 < 0) ? 0xffff : 0r6 := a3r7 := (a3 < 0) ? 0xffff : 0 */extern __m128i _mm_cvtepi16_epi32(__m128i shortValues);//byteValues: A 128-bit parameter that contains two signed 8-bit integers//in the lower 16 bits, byteValues=(a0, a1, ... , a15)/*则r0 := a0r1 := (a0 < 0) ? 0xff : 0r2 := (a0 < 0) ? 0xff : 0r3 := (a0 < 0) ? 0xff : 0r4 := (a0 < 0) ? 0xff : 0r5 := (a0 < 0) ? 0xff : 0r6 := (a0 < 0) ? 0xff : 0r7 := (a0 < 0) ? 0xff : 0r8 := a1r9 := (a1 < 0) ? 0xff : 0r10 := (a1 < 0) ? 0xff : 0r11 := (a1 < 0) ? 0xff : 0r12 := (a1 < 0) ? 0xff : 0r13 := (a1 < 0) ? 0xff : 0r14 := (a1 < 0) ? 0xff : 0r15 := (a1 < 0) ? 0xff : 0 */extern __m128i _mm_cvtepi8_epi64 (__m128i byteValues); //intValues: A 128-bit parameter that contains two signed 32-bit //integers in the lower 64 bits, intValues=(a0, a1, a2, a3)/*则r0 := a0r1 := (a0 < 0) ? 0xffffffff : 0r2 := a1r3 := (a1 < 0) ? 0xffffffff : 0*/extern __m128i _mm_cvtepi32_epi64(__m128i intValues);//shortValues:A 128-bit parameter that contains two signed 16-bit integers//in the lower 32 bits, shortValues=(a0, a1, ..., a7)/*则r0 := a0r1 := (a0 < 0) ? 0xffff : 0r2 := (a0 < 0) ? 0xffff : 0r3 := (a0 < 0) ? 0xffff : 0r4 := a1r5 := (a1 < 0) ? 0xffff : 0r6 := (a1 < 0) ? 0xffff : 0r7 := (a1 < 0) ? 0xffff : 0*/extern __m128i _mm_cvtepi16_epi64(__m128i shortValues);//byteValues:A 128-bit parameter that contains eight signed 8-bit integers //in the lower 64 bits, byteValues=(a0, a1, ..., a15)/*则r0 := a0r1 := (a0 < 0) ? 0xff : 0r2 := a1r3 := (a1 < 0) ? 0xff : 0...r14 := a7r15 := (a7 < 0) ? 0xff : 0*/extern __m128i _mm_cvtepi8_epi16 (__m128i byteValues);/*Packed integer zero-extension*///byteValues:A 128-bit parameter that contains four unsigned 8-bit integers//in the lower 32 bits, byteValues=(a0, a1, ... 
, a15)/*则r0 := a0r1 := 0r2 := 0r3 := 0r4 := a1r5 := 0r6 := 0r7 := 0r8 := a2r9 := 0r10 := 0r11 := 0r12 := a3r13 := 0r14 := 0r15 := 0*/extern __m128i _mm_cvtepu8_epi32 (__m128i byteValues);//shortValues:A 128-bit parameter that contains four unsigned 16-bit integers//in the lower 64 bits, shortValues=(a0, a1, ... , a7)/*则r0 := a0r1 := 0r2 := a1r3 := 0r4 := a2r5 := 0r6 := a3r7 := 0*/extern __m128i _mm_cvtepu16_epi32(__m128i shortValues);//shortValues:A 128-bit parameter that contains two unsigned 8-bit integers//in the lower 16 bits, shortValues=(a0, a1, ..., a15)/*则r0 := a0r1 := 0r2 := 0r3 := 0r4 := 0r5 := 0r6 := 0r7 := 0r8 := a1r9 := 0r10 := 0r11 := 0r12 := 0r13 := 0r14 := 0r15 := 0*/extern __m128i _mm_cvtepu8_epi64 (__m128i shortValues);//intValues:A 128-bit parameter that contains two unsigned 32-bit integers//in the lower 64 bits, intValues=(a0, a1, a2, a3)/*则r0 = a0r1 = 0r2 = a1r3 = 0*/extern __m128i _mm_cvtepu32_epi64(__m128i intValues);//shortValues:A 128-bit parameter that contains two unsigned 16-bit integers//in the lower 32 bits, shortValues=(a0, a1, ... , a7)/*则r0 := a0r1 := 0r2 := 0r3 := 0r4 := a1r5 := 0r6 := 0r7 := 0*/extern __m128i _mm_cvtepu16_epi64(__m128i shortValues);//byteValues:A 128-bit parameter that contains eight unsigned 8-bit integers //in the lower 64 bits, byteValues=(a0, a1, ... , a15)/*则r0 := a0r1 := 0r2 := a1r3 := 0...r14 := a7r15 := 0*/extern __m128i _mm_cvtepu8_epi16 (__m128i byteValues);/*Pack 8 double words from 2 operands into 8 words of resultwith unsigned saturation *///val1=(val10,...,vall3), val2=(val20, ..., val23)/*则r0 := (val10 < 0) ? 0 : ((val10 > 0xffff) ? 0xffff : val10)r1 := (val11 < 0) ? 0 : ((val11 > 0xffff) ? 0xffff : val11)r2 := (val12 < 0) ? 0 : ((val12 > 0xffff) ? 0xffff : val12)r3 := (val13 < 0) ? 0 : ((val13 > 0xffff) ? 0xffff : val13)r4 := (val20 < 0) ? 0 : ((val20 > 0xffff) ? 0xffff : val20)r5 := (val21 < 0) ? 0 : ((val21 > 0xffff) ? 0xffff : val21)r6 := (val22 < 0) ? 0 : ((val22 > 0xffff) ? 
0xffff : val22)r7 := (val23 < 0) ? 0 : ((val23 > 0xffff) ? 0xffff : val23)*/extern __m128i _mm_packus_epi32(__m128i val1, __m128i val2);/*Sum absolute 8-bit integer difference of adjacent groups of 4 byteintegers in operands. Starting offsets within operands aredetermined by mask *///s1, s2: sixteen 8-bit unsigned integers// msk0, msk1, and msk2 are the three least significant bits of parameter msk/*则i = msk2 * 4j = msk0-1 * 4for (k = 0; k < 8; k = k + 1) {t0 = abs(s1[i + k + 0] - s2[j + 0])t1 = abs(s1[i + k + 1] - s2[j + 1])t2 = abs(s1[i + k + 2] - s2[j + 2])t3 = abs(s1[i + k + 3] - s2[j + 3])r[k] = t0 + t1 + t2 + t3}*/extern __m128i _mm_mpsadbw_epu8(__m128i s1, __m128i s2, const int msk);/** Load double quadword using non-temporal aligned hint*///This instruction loads data from a specified address.The memory source must be //16-byte aligned because the return value consists of sixteen bytes.则r=*v1extern __m128i _mm_stream_load_si128(__m128i* v1);

nmmintrin.h为SSE4.2头文件,其文件中各函数的介绍:

	/** Intrinsics for text/string processing.*///Either the computed mask of MaxSize bits or its expansion to a 128-bit parameter.//If the return value is expanded, each bit of the result mask is expanded to a //byte or a word.详见参考文献2extern __m128i _mm_cmpistrm (__m128i a, __m128i b, const int mode);//An integer between 0 and Maxsize. MaxSize when the computed mask equals 0.//Otherwise, the index of the leftmost or rightmost bit set to 1 in this mask.//详见参考文献2extern int     _mm_cmpistri (__m128i a, __m128i b, const int mode);//Either the computed mask of MaxSize bits or its expansion to a 128-bit parameter.//If the return value is expanded, each bit of the result mask is expanded to //a byte or a word.详见参考文献3extern __m128i _mm_cmpestrm (__m128i a, int la, __m128i b, int lb, const int mode);//An integer that ranges between 0 and MaxSize. Maxsize is returned when the //resulting bitmask is equal to 0. Otherwise, the index of either the leftmost//or rightmost bit set to 1 in this mask.详见参考文献3extern int     _mm_cmpestri (__m128i a, int la, __m128i b, int lb, const int mode);/** Intrinsics for text/string processing and reading values of EFlags.*///Returns one if the null character occurs in b. Otherwise, zero. When one is //returned, it means that b contains the ending fragment of the string that is //being compared.详见参考文献2extern int     _mm_cmpistrz (__m128i a, __m128i b, const int mode);//Zero if the resulting mask is equal to zero. Otherwise, one.//详见参考文献2extern int     _mm_cmpistrc (__m128i a, __m128i b, const int mode);//One if the null character occurs in a. Otherwise, zero. When one is returned,//it means that a contains the ending fragment of the string that is being compared.//详见参考文献2extern int     _mm_cmpistrs (__m128i a, __m128i b, const int mode);//bit0 of the resulting bitmask.详见参考文献2extern int     _mm_cmpistro (__m128i a, __m128i b, const int mode);//One if b is does not contain the null character and the resulting mask is //equal to zero. Otherwise, zero. 
详见参考文献2extern int     _mm_cmpistra (__m128i a, __m128i b, const int mode);//One if the absolute value of lb is less than MaxSize. Otherwise, zero.详见参考文献3extern int     _mm_cmpestrz (__m128i a, int la, __m128i b, int lb, const int mode);//Zero if the resulting mask is equal to zero. Otherwise, one.详见参考文献3extern int     _mm_cmpestrc (__m128i a, int la, __m128i b, int lb, const int mode);//One if the absolute value of la is less than MaxSize. Otherwise, zero.详见参考文献3extern int     _mm_cmpestrs (__m128i a, int la, __m128i b, int lb, const int mode);//bit0 of the resulting bitmask. 详见参考文献3extern int     _mm_cmpestro (__m128i a, int la, __m128i b, int lb, const int mode);//One if the absolute value of lb is larger than or equal to MaxSize and the //resulting mask is equal to zero. Otherwise, zero.详见参考文献3extern int     _mm_cmpestra (__m128i a, int la, __m128i b, int lb, const int mode);/** Packed integer 64-bit comparison, zeroing or filling with ones* corresponding parts of result*///val1=(val10, val11), val2=(val20, val21)//则,r0 = (val10 > val20) ? 0xffffffffffffffff : 0x0//	 r1 = (val11 > val21) ? 
0xffffffffffffffff : 0x0extern __m128i _mm_cmpgt_epi64(__m128i val1, __m128i val2);/** Calculate a number of bits set to 1*///The number of bits set to one in vextern int _mm_popcnt_u32(unsigned int v);//The number of bits set to one in vextern __int64 _mm_popcnt_u64(unsigned __int64 v);/** Accumulate CRC32 (polynomial 0x11EDC6F41) value*///crc:循环冗余校验码,CRC32-C algorithm is based on polynomial 0x1EDC6F41,//r = crc + CRC-32C(v)extern unsigned int _mm_crc32_u8 (unsigned int crc, unsigned char v);//crc:循环冗余校验码,CRC32-C algorithm is based on polynomial 0x1EDC6F41,//r = crc + CRC-32C(v)extern unsigned int _mm_crc32_u16(unsigned int crc, unsigned short v);//crc:循环冗余校验码,CRC32-C algorithm is based on polynomial 0x1EDC6F41,//r = crc + CRC-32C(v)extern unsigned int _mm_crc32_u32(unsigned int crc, unsigned int v);//crc:循环冗余校验码,CRC32-C algorithm is based on polynomial 0x1EDC6F41,//r = crc + CRC-32C(v)extern unsigned __int64 _mm_crc32_u64(unsigned __int64 crc, unsigned __int64 v);

参考文献:
1、http://msdn.microsoft.com/zh-cn/library/bb514044(v=vs.100).aspx
2、http://msdn.microsoft.com/zh-cn/library/bb513993(v=vs.100).aspx
3、http://msdn.microsoft.com/zh-cn/library/bb514048(v=vs.100).aspx

相关文章:

Nacos v0.7.0:对接CMDB,实现基于标签的服务发现能力

Nacos近期发布了0.7.0版本,该版本支持对接第三方CMDB获取CMDB数据、使用Selector机制来配置服务的路由类型、支持单机模式使用MySQL数据库、上线Node.js客户端,并修复了一些bug。对接CMDB实现就近访问在服务进行多机房或者多地域部署时,跨地域…

数十篇推荐系统论文被批无法复现:源码、数据集均缺失,性能难达预期

作者 | Maurizio Ferrari Dacrema译者 | 凯隐责编 | Jane出品 | AI科技大本营&#xff08;ID: rgznai100&#xff09;【导读】来自意大利米兰理工大学的 Maurizio 团队近日发表了一篇极具批判性的文章&#xff0c;剑指推荐系统领域的其他数十篇论文&#xff0c;指出这些论文中基…

crontab 总结

2019独角兽企业重金招聘Python工程师标准>>> 1.写法 每三天执行一次:0 0 */3 * * root command,注意:* * */3 * * root command 这样写是不对的。其它每N小时执行一次也类似 (后续补充) 转载于:https://…

ubuntu安装thrift

ubuntu环境下安装thrift-0.10.0 1.解压 2.编译安装 ./configure -with-cpp -with-boost -without-python -without-csharp -with-java -without-erlang -without-perl -with-php -without-php_extension -without-ruby -without-haskell -without-go make sudo make install3.是…

AES(Advanced Encryption Standard) Intrinsics各函数介绍

AES为高级加密标准,是较流行的一种密码算法。 SIMD相关头文件包括: //#include <ivec.h>//MMX //#include <fvec.h>//SSE(also include ivec.h) //#include <dvec.h>//SSE2(also include fvec.h) #include <mmintrin.h> //MMX #…

轻松应对Java试题,这是一份大数据分析工程师面试指南

作者 | HappyMint转载自大数据与人工智能&#xff08;ai-big-data&#xff09;导语&#xff1a;经过这一段时间与读者的互动与沟通&#xff0c;本文作者发现很多小伙伴会咨询面试相关的问题&#xff0c;特别是即将毕业的小伙伴&#xff0c;所以决定输出一系列面试相关的文章。本…

【Elasticsearch 5.6.12 源码】——【3】启动过程分析(下)...

版权声明&#xff1a;本文为博主原创&#xff0c;转载请注明出处&#xff01;简介 本文主要解决以下问题&#xff1a; 1、ES启动过程中的Node对象都初始化了那些服务&#xff1f;构造流程 Step 1、创建一个List暂存初始化失败时需要释放的资源&#xff0c;并使用临时的Logger对…

C++中的封装、继承、多态

封装(encapsulation)&#xff1a;就是将抽象得到的数据和行为(或功能)相结合&#xff0c;形成一个有机的整体&#xff0c;也就是将数据与操作数据的源代码进行有机的结合&#xff0c;形成”类”&#xff0c;其中数据和函数都是类的成员。封装的目的是增强安全性和简化编程&…

比尔盖茨护犊子 称iPad让大批用户沮丧

为什么80%的码农都做不了架构师&#xff1f;>>> 在5月6日接受美国CNBC电视台访问时&#xff0c;微软前任掌门人比尔盖茨维护了自家反响不那么好的Surface系列平板电脑&#xff0c;同时他还不忘吐槽了一把iPad。 当 谈到日渐颓败的PC市场时&#xff0c;盖茨称平板电…

小心陷阱:二维动态内存的不连续性

void new_test() {int** pp;pp = new int*[10];for(int i=0; i<10; i++){pp[i] = new int[10];}//pp[0], pp[1], ... , pp[9]在内存中连续;//a1 = pp[0][0], pp[0][1], ... , pp[0][9]在内存中也是连续的;//a2 = pp[1][0], pp[1][1], ... , pp[1][9]在内存中也是连续的;//...//a9 …

超酷炫!Facebook用深度学习和弱监督学习绘制全球精准道路图

作者 | Saikat Basu等译者 | 陆离责编 | 夕颜出品 | AI科技大本营&#xff08;ID: rgznai100&#xff09;导读&#xff1a;现如今&#xff0c;即使可以借助卫星图像和绘制软件&#xff0c;创建精确的道路图也依然是一个费时费力的人力加工过程。许多地区&#xff0c;特别是在发…

npm包发布记录

下雪了&#xff0c;在家闲着&#xff0c;不如写一个npm 包发布。简单的 npm 包的发布网上有很多教程&#xff0c;我就不记录了。这里记录下&#xff0c;一个复杂的 npm 包发布&#xff0c;复杂指的构建环境复杂。 整个工程使用 rollup 来构建&#xff0c;其中会引进 babel 来转…

设计模式之单例模式(Singleton)摘录

23种GOF设计模式一般分为三大类&#xff1a;创建型模式、结构型模式、行为模式。 创建型模式包括&#xff1a;1、FactoryMethod(工厂方法模式)&#xff1b;2、Abstract Factory(抽象工厂模式)&#xff1b;3、Singleton(单例模式)&#xff1b;4、Builder(建造者模式)&#xff1…

关于知识蒸馏,这三篇论文详解不可错过

作者 | 孟让转载自知乎导语&#xff1a;继《从Hinton开山之作开始&#xff0c;谈知识蒸馏的最新进展》之后&#xff0c;作者对知识蒸馏相关重要进行了更加全面的总结。在上一篇文章中主要介绍了attention transfer&#xff0c;FSP matrix和DarkRank&#xff0c;关注点在于寻找不…

设计模式之建造者模式(生成器模式、Builder)摘录

23种GOF设计模式一般分为三大类&#xff1a;创建型模式、结构型模式、行为模式。 创建型模式包括&#xff1a;1、FactoryMethod(工厂方法模式)&#xff1b;2、Abstract Factory(抽象工厂模式)&#xff1b;3、Singleton(单例模式)&#xff1b;4、Builder(建造者模式、生成器模式…

[置顶] webservice系列2---javabeanhandler

摘要&#xff1a;本节主要介绍以下两点&#xff0c;1.带javabean的webservice的开发和调用 2.handler的简单介绍及使用1.引言在之前的一篇博客webservice系列1---基于web工程上写一个基本数据类型的webservice中介绍了如何采用axis1.4来完成一个简单的webservice的开发流程(入参…

AI教育公司物灵科技完成战略融资,商汤科技投资

1月2日消息&#xff0c;从相关媒体报道&#xff0c;AI教育公司物灵科技近日完成了商汤的战略融资&#xff0c;本轮融资将用于产品迭代和扩大市场。 此前投资界曾报道&#xff0c;物灵科技已经获得1.5亿元Pre-A轮融资&#xff0c;当时具体资方未透露。 公开资料显示&#xff0…

Python之父发文,将重构现有核心解析器

原题 | PEG Parsers作者 | Guido van Rossum译者 | 豌豆花下猫转载自 Python猫&#xff08;ID: python_cat&#xff09; 导语&#xff1a;Guido van Rossum 是 Python 的创造者&#xff0c;虽然他现在放弃了“终身仁慈独裁者”的职位&#xff0c;但却成为了指导委员会的五位成员…

全面支持三大主流环境 |百度PaddlePaddle新增Windows环境支持

2019独角兽企业重金招聘Python工程师标准>>> PaddlePaddle作为国内首个深度学习框架&#xff0c;最近发布了更加强大的Fluid1.2版本, 增加了对windows环境的支持&#xff0c;全面支持了Linux、Mac、 windows三大环境。 PaddlePaddle在功能完备的基础上&#xff0c;也…

设计模式之原型模式(Prototype)摘录

23种GOF设计模式一般分为三大类&#xff1a;创建型模式、结构型模式、行为模式。 创建型模式包括&#xff1a;1、FactoryMethod(工厂方法模式)&#xff1b;2、Abstract Factory(抽象工厂模式)&#xff1b;3、Singleton(单例模式)&#xff1b;4、Builder(建造者模式、生成器模式…

NFS共享服务挂载时出现“access denied by server while mounting”的解决方法

笔者用的Linux发行版本为Centos6.4,以下方法理论上讲对于Fedora, Red Hat均有效: 搭建好NFS服务后,如果用以下的命令进行挂载: # mount -t nfs 172.16.12.140:/home/liangwode/test /mnt 出现如下错误提示: mount.nf…

设计模式之桥接模式(Bridge)摘录

23种GOF设计模式一般分为三大类&#xff1a;创建型模式、结构型模式、行为模式。 创建型模式包括&#xff1a;1、FactoryMethod(工厂方法模式)&#xff1b;2、Abstract Factory(抽象工厂模式)&#xff1b;3、Singleton(单例模式)&#xff1b;4、Builder(建造者模式、生成器模式…

原360首席科学家颜水成正式加入依图科技,任首席技术官

7 月 29 日&#xff0c;依图科技宣布原 360 首席科学家颜水成正式加入&#xff0c;担任依图科技首席技术官&#xff08;CTO&#xff09;一职。依图方面称&#xff0c;颜水成加入后将带领团队进一步夯实依图在人工智能基础理论和原创算法方面的技术优势&#xff0c;为依图在商业…

分布式存储fastdfs安装使用

1.下载地址 https://github.com/happyfish100/fastdfs https://github.com/happyfish100/fastdfs/wiki 安装辅助说明文档 2.安装编译环境 yum install git gcc gcc-c++ make automake autoconf libtool pcre pcre-devel zlib zlib-devel openssl-devel wget vim -y 三台主机:…

Hibernate学习(九)———— 二级缓存和事务级别详讲

序言 这算是hibernate的最后一篇文章了&#xff0c;下一系列会讲解Struts2的东西&#xff0c;然后说完Struts2&#xff0c;在到Spring&#xff0c;然后在写一个SSH如何整合的案例。之后就会在去讲SSM&#xff0c;在之后我自己的个人博客应该也差不多可以做出来了。基本上先这样…

超详细中文预训练模型ERNIE使用指南

作者 | 高开远&#xff0c;上海交通大学&#xff0c;自然语言处理研究方向最近在工作上处理的都是中文语料&#xff0c;也尝试了一些最近放出来的预训练模型&#xff08;ERNIE&#xff0c;BERT-CHINESE&#xff0c;WWM-BERT-CHINESE&#xff09;&#xff0c;比对之后还是觉得百…

linux内核SMP负载均衡浅析

需求 在《linux进程调度浅析》一文中提到&#xff0c;在SMP&#xff08;对称多处理器&#xff09;环境下&#xff0c;每个CPU对应一个run_queue&#xff08;可执行队列&#xff09;。如果一个进程处于TASK_RUNNING状态&#xff08;可执行状态&#xff09;&#xff0c;则它…

结构体中最后一个成员为[0]或[1]长度数组(柔性数组成员)的用法

结构体中最后一个成员为[0]长度数组的用法:这是个广泛使用的常见技巧,常用来构成缓冲区。比起指针,用空数组有这样的优势:(1)、不需要初始化,数组名直接就是所在的偏移;(2)、不占任何空间,指针需…

超全!深度学习在计算机视觉领域的应用一览

作者 | 黄浴&#xff0c;奇点汽车美研中心首席科学家兼总裁转载自知乎简单回顾的话&#xff0c;2006年Geoffrey Hinton的论文点燃了“这把火”&#xff0c;现在已经有不少人开始泼“冷水”了&#xff0c;主要是AI泡沫太大&#xff0c;而且深度学习不是包治百病的药方。计算机视…

SHAREPOINT2010数据库升级2013

在作TEST-SPCONTENT命令时,会提示认证方式不一样。 The [SharePoint - 80] web application is configured with claims authentication mode however the content database you are trying to attach is intended to be used against a windows classic authentic…