SSE指令集加速运算 | Nicolas的博客

先上代码：

/*g++ -msse2 main.cpp -lrt*/

#include <iostream>
#include <xmmintrin.h>//SSE指令集需包含此头文件
#include <time.h>
using namespace std;

#define N 120
int main() {
    // Benchmark: multiply two float arrays element-wise, repeated 10000 times,
    // first with SSE intrinsics (four floats per instruction) and then with a
    // plain scalar loop. The elapsed wall-clock time of each variant is printed
    // in microseconds.
    //
    // NOTE(review): the products in f3 are never read back, so an optimizing
    // compiler may remove the loops entirely; confirm the build flags before
    // trusting the printed figures.
    struct timespec tpstart, tpend;

    ///////////////// SSE version
    clock_gettime(CLOCK_MONOTONIC, &tpstart);
    {
        // _mm_load_ps/_mm_store_ps require their address to be 16-byte
        // (128-bit) aligned. The argument of aligned() is in BYTES, so 16 is
        // the value that matches one __m128 (four packed floats).
        // To inspect the alignment:
        //   std::cout << f1 << "\t" << (size_t)f1 % 16 << std::endl;
        float f1[N] __attribute__((aligned(16)));
        float f2[N] __attribute__((aligned(16)));
        float f3[N] __attribute__((aligned(16)));

        for (int i = 0; i < N; i++) {
            f1[i] = i + 0.12;
            f2[i] = i + 0.16;
        }

        for (int rep = 0; rep < 10000; rep++) {
            int i = 0;
            // Each iteration multiplies four consecutive floats at once.
            // The bound "i + 4 <= N" keeps the last vector access inside
            // the arrays even when N is not a multiple of 4.
            for (; i + 4 <= N; i += 4) {
                const __m128 a = _mm_load_ps(f1 + i);
                const __m128 b = _mm_load_ps(f2 + i);
                _mm_store_ps(f3 + i, _mm_mul_ps(a, b));
            }
            // Scalar tail for the remaining (N % 4) elements, if any.
            for (; i < N; i++) {
                f3[i] = f1[i] * f2[i];
            }
        }
    }
    clock_gettime(CLOCK_MONOTONIC, &tpend);
    // Seconds are scaled to microseconds, nanoseconds are divided down.
    long timedif = (tpend.tv_sec - tpstart.tv_sec) * 1000 * 1000
                 + (tpend.tv_nsec - tpstart.tv_nsec) / 1000;
    std::cout << "SSE:\t" << timedif << std::endl;

    ///////////////// Scalar version (no SSE intrinsics)
    clock_gettime(CLOCK_MONOTONIC, &tpstart);
    {
        float f1[N], f2[N], f3[N];
        for (int i = 0; i < N; i++) {
            f1[i] = i + 0.12;
            f2[i] = i + 0.16;
        }

        for (int rep = 0; rep < 10000; rep++) {
            for (int i = 0; i < N; i++) {
                f3[i] = f1[i] * f2[i];
            }
        }
    }
    clock_gettime(CLOCK_MONOTONIC, &tpend);
    timedif = (tpend.tv_sec - tpstart.tv_sec) * 1000 * 1000
            + (tpend.tv_nsec - tpstart.tv_nsec) / 1000;
    std::cout << "noSSE:\t" << timedif << std::endl;

    return 0;
}

注意计时函数需要包含的头文件和编译选项，clock_gettime函数能精确到纳秒。

运行结果为：

SSE:    4929
noSSE:    9980

以微秒为单位，SSE版本的耗时约为普通版本的一半，即速度约为原来的两倍。

下面是另一份代码，演示如何动态申请空间，注意内存对齐：

#include <xmmintrin.h>
#include <iostream>
using namespace std;
const int N=120;

int main()
{
    // Demonstrates heap allocation of aligned float buffers with _mm_malloc,
    // an element-wise SSE addition over them, and release with _mm_free.

    // Alignment is given in BYTES. Aligned SSE loads/stores need only 16;
    // 128 is kept here so the remainders printed below stay "address % 128".
    const size_t kAlignment = 128;

    float *pf1 = static_cast<float *>(_mm_malloc(sizeof(float) * N, kAlignment));
    float *pf2 = static_cast<float *>(_mm_malloc(sizeof(float) * N, kAlignment));
    float *pf3 = static_cast<float *>(_mm_malloc(sizeof(float) * N, kAlignment));
    if (!pf1 || !pf2 || !pf3) {
        // Release whatever was obtained before reporting the failure.
        if (pf1) _mm_free(pf1);
        if (pf2) _mm_free(pf2);
        if (pf3) _mm_free(pf3);
        std::cerr << "_mm_malloc failed" << std::endl;
        return 1;
    }

    // Print each address modulo the alignment; a correctly aligned buffer
    // prints 0. The cast goes through size_t because casting a pointer to
    // int loses bits on 64-bit targets.
    std::cout << reinterpret_cast<size_t>(pf1) % kAlignment << std::endl;
    std::cout << reinterpret_cast<size_t>(pf2) % kAlignment << std::endl;
    std::cout << reinterpret_cast<size_t>(pf3) % kAlignment << std::endl;

    for (int i = 0; i < N; i++) {
        pf1[i] = i + 0.12;
        pf2[i] = i + 0.16;
    }

    int i = 0;
    // Add four floats per iteration; the bound keeps the last vector access
    // inside the buffers even when N is not a multiple of 4.
    for (; i + 4 <= N; i += 4) {
        const __m128 a = _mm_load_ps(pf1 + i);
        const __m128 b = _mm_load_ps(pf2 + i);
        _mm_store_ps(pf3 + i, _mm_add_ps(a, b));
    }
    // Scalar tail for the remaining (N % 4) elements, if any.
    for (; i < N; i++) {
        pf3[i] = pf1[i] + pf2[i];
    }

    // Memory obtained from _mm_malloc must be released with _mm_free.
    _mm_free(pf1);
    _mm_free(pf2);
    _mm_free(pf3);
    return 0;
}

Linux 34

Ubuntu 9

hg 1

git 2

GSL 1

SSH 4

TeX 1

Code::Blocks 1

Makefile 2

Matlab 3

花生壳 1

grub 1

stardict 1

Windows 3

VirtualBox 1

R 18

Kubuntu 2

Telnet 1

C++ 7

SSE 1

VPN 1

pptpd 1

WordPress 2

Firefox 1

Shell 4

C 1

printf 1

ggplot 2

cygwin 1

cgroup 1

CUDA 3

NTP 1

Server 1

VNC 1

qtiplot 1

mime 1

openbox 1

Grub 1

RCurl 1

IPV6 1

不务正业 1

cgroups 1

ulimit 1

Freetype 1

Gnuplot 2

LibGD 1

GCC 1

libtorrent 1

rtorrent 1

OS 2

awk 1

tips 3

Project Euler 11

Nvidia 1

QT 1

VS2010 1

phpcloud 1

RSS 1

TTRSS 1

crontab 1

alsa 1

capture 1

autofs 1

NIS 1

Samba 1

Apache 1

texmacs 1

ifuse 1

Ipod 1

rsync 1

Jekyll 1

Markdown 2

C/C++ 2

Vim 1

Vimperator 1

firefox 1

Openbox 1

Ubuntu server 1

Obuntu 1

Ipython 1