什么是OpenCL?
OpenCL是一组公开的代码API,用于规范异构计算。在这里我们指GPU并行计算。
参考: OpenCL 2.0 异构计算 [第三版] (Heterogeneous Computing with OpenCL 2.0)
如何实现异构计算?
以我浅薄的理解:一般计算需要一下几个要素,代码,内存,计算核心,异构计算需要用一套代码,在不同类型内存,不同计算核心上运行,因此对代码计算核心和内存的抽象是必要的。
我们考虑如下几个问题:
- 如何描述不同的内存
- 如何描述不同的计算核心
- 如何描述内存之间的相互作用
- 如何描述计算核心之间的相互作用
- 如何描述内存与计算核心之间的相互作用
如何描述不同的内存
clCreateBuffer可开辟OpenCL中特有的连续数组内存
clCreateBuffer(
cl_context context,
cl_mem_flags flags,
size_t size,
void *host_ptr,
cl_int *errcode_ret)
存在一个用于管理开辟的内存的集合,即上下文(context), 开辟的内存有可设置的读写属性,内存实际位置属性,有可设置的初始内容属性
可选的读写属性:CL_MEM_READ_ONLY,CL_MEM_WRITE_ONLY,CL_MEM_READ_WRITE
可选的实际位置属性:(CL_MEM_USE_HOST_PTR):要求使用主机内存,(CL_MEM_COPY_HOST_PTR,CL_MEM_ALLOC_HOST_PTR):不要求使用主机内存,
当然,还有主机上的一般内存,不由OpenCL规定
此外还有些核代码中的关键词用于声明内存类型:__constant,__global,__local
如何描述不同的计算核心
OpenCL计算核心可分为主机(host),设备(device),其中设备还可以分为多个不同的工作核心,每个设备中的工作核心有唯一表示get_global_id(0),只能通过这一表示区别不同核心?
每个核心上可以并行执行核(kernel)代码
如何描述内存之间的相互作用
clEnqueueWriteBuffer,clEnqueueReadBuffer用于实现数据的转移,如从OpenCL管理的内存与一般主机上内存之间的数据交换。
cl_int
clEnqueueWriteBuffer(
cl_command_queue command_queue,
cl_mem buffer,//OpenCL内存
cl_bool blocking_write,//同步与异步数据传输
size_t offset,
size_t cb,
const void *ptr,//host内存
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event)
如何描述计算核心之间的相互作用
主机计算核心与设备计算核心之间存在通信,主要是主机下达命令,设备接受,返回结果,附带一些参数的传递,参数的传递依靠上下文。
这种通信抽象为命令队列,
cl_command_queue
clCreateCommandQueueWithProperties(
cl_context context,
cl_device_id device,
cl_command_queue_properties peoperties,
cl_int *errcode_ret)
可通过clEnqueueReadBuffer,clEnqueueNDRangeKernel在队列中插入命令
如何描述内存与计算核心之间的相互作用
核代码通过关键字,__constant,__global,__local,创建或者读取写入相应内存,这些不同关键字标识的内存有不同的限制,如访问速度,最大内存大小,能否在不同核心之间共享
网格上的ising模型
利用Monte Carlo方法模拟ising模型中自旋的翻转,这里强调OpenCL代码实现。
//easugpu-2.h,一个简化opencl调用的个人头文件
#include <iostream>
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
#define aaa(c) c
//gpu初始化,建立上下文,命令队列等
#define gpu_buile(name) \
cl_int name##_CL_err; /*Looking up the available GPUs*/ \
cl_platform_id name##_platforms[2]; \
cl_uint name##_num = 0; \
name##_CL_err = clGetPlatformIDs(2, name##_platforms, (cl_uint *)&name##_num); \
std::cout << name##_CL_err<< "\t->clGetPlatformIDs" << std::endl; \
cl_device_id name##_devices[name##_num]; \
\
name##_CL_err = clGetDeviceIDs(name##_platforms[0], CL_DEVICE_TYPE_GPU, name##_num, name##_devices, NULL); \
if (name##_CL_err == CL_SUCCESS) \
{ \
std::cout << name##_num<< "\t->find" << std::endl; \
} \
else \
{ \
std::cout << name##_CL_err << "\t->clGetDeviceIDs"<< std::endl; \
} \
cl_context_properties name##_properties[] = {CL_CONTEXT_PLATFORM, (long int)name##_platforms[0], 0}; \
cl_context name##_context = clCreateContextFromType(name##_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &name##_CL_err); /*create a command queue*/ \
std::cout << name##_CL_err << "\t->clCreateContextFromType"<< std::endl; \
cl_command_queue name##_queue = clCreateCommandQueueWithProperties(name##_context, name##_devices[0], 0, NULL)
//定义只读内存
#define defreadbuff(name, len, adrr) clCreateBuffer(name##_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, len, adrr, NULL)
//定义读写内存
#define defreadwritebuff(name, len, adrr) clCreateBuffer(name##_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, len, adrr, NULL)
//生成核函数
#define getKernel(name, kernelname) \
cl_int name##_errcode_ret; \
cl_program name##_program = clCreateProgramWithSource(name##_context, name##_num, (const char **)&name##_KernelSource, NULL, &name##_errcode_ret); \
std::cout<<name##_errcode_ret<<"\t->clCreateProgramWithSource"<<std::endl;\
cl_int name##_status; \
name##_status = clBuildProgram(name##_program, name##_num, name##_devices, NULL, NULL, NULL); \
std::cout << name##_status << "\t->clBuildProgram" << std::endl; \
if (name##_status < 0) \
{ \
size_t name##_len = 0; \
cl_int name##_ret = CL_SUCCESS; \
name##_ret = clGetProgramBuildInfo(name##_program, name##_devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &name##_len); \
char *name##_buffer = (char *)calloc(name##_len, sizeof(char)); \
name##_ret = clGetProgramBuildInfo(name##_program, name##_devices[0], CL_PROGRAM_BUILD_LOG, name##_len, name##_buffer, NULL); \
std::cout << "faile:\n " << name##_buffer << std::endl; \
} \
else \
{ \
std::cout << "Build Success" << std::endl; \
} \
cl_kernel name##_kernel = clCreateKernel(name##_program, #kernelname /*函数名*/, NULL)
//设置核函数参数
#define setKernelArg(name, pos, inputBuffer) \
clSetKernelArg(name##_kernel, pos, sizeof(cl_mem), (void *)&inputBuffer)
//运行核函数
#define runker(name, global_size) \
size_t name##_global_work_size[1] = {(size_t)global_size}; \
size_t name##_local_work_size[1] = {64}; \
size_t name##_offset[1] = {0}; \
clEnqueueNDRangeKernel(name##_queue, name##_kernel, 1, name##_offset, name##_global_work_size, name##_local_work_size, 0, NULL, NULL)
//从OpenCL内存数据转移到到一般内存
#define readdata(name, frombuff, size, to) \
name##_status = clEnqueueReadBuffer(name##_queue, frombuff, CL_TRUE, 0, size, to, 0, NULL, NULL)
//主程序
// g++ is_model_2D_square.cpp -lOpenCL
#include "easugpu-2.h"
#include <fstream>
#include <vector>
#include <sstream>
#include <random>
#define nbern 10
int looplen = 1000;
int tiplen = 1000;
int globalsize = 1024;
struct point
{
int from;
int updown;
int to[nbern] = {0};
};
struct parameter
{
float temperature;
int looplen;
int plen;
int over;
};
int main(int argc, const char *argv[])
{
auto outf="out.txt";
auto inf="graph.txt";
int bigloop=40;
double intt=0;
if(argc>=4){
inf=argv[1];
//输入文件名
outf=argv[2];
//输出文件名
bigloop=std::stoi(argv[3]);
//几条相变曲线
}
if(argc>=5){
//指定温度,此时bigloop失效,输出fram.txt文件为此温度下的场
//fram.txt 位置id 状态
intt=std::stod(argv[4]);
}
std::cout <<"inf:"<< inf << std::endl;
std::cout <<"outf:"<< outf << std::endl;
std::cout <<"bigloop:"<< bigloop << std::endl;
std::cout <<"intt:"<< intt << std::endl;
const char *ismod_KernelSource =
(
#include "is_model_gpu.cl"
);
gpu_buile(ismod);
std::ifstream fin;
fin.open(inf, std::ios::in);
if (!fin.is_open())
{
std::cerr << "cannot open the file"<<std::endl;
return 0;
}
char line[1024] = {0};
std::vector<point> pointlist;
while (fin.getline(line, sizeof(line)))
{
//定义局部变量
point p;
//从“行”中提取“单词”
std::stringstream word(line);
word >> p.from;
word >> p.updown;
int num;
for (int ii = 0; ii < nbern; ii++)
{
word >> p.to[ii];
}
pointlist.push_back(p);
}
srand(time(0));
int E0 = 0;
int E1 = 0;
int plen = pointlist.size();
float temperature = 0.1;
float j = 1;
std::cout << "plen:" << plen << std::endl;
parameter par;
par.looplen = looplen;
par.plen = pointlist.size();
par.temperature = 1;
par.over = 0;
getKernel(ismod, sum);
double meanover = 0;
std::vector<int> ciju;
int maxloop=200;
std::vector<point> cpp(pointlist);
// std::mt19937 rng(std::random_device{}());
std::mt19937 rnd(time(nullptr));
uint randlist[looplen * globalsize * 2] = {0};
for(int ppi=0;ppi<bigloop;ppi++){
for (par.temperature = 1; par.temperature < 4; par.temperature = par.temperature + 0.1)
{
if(intt>0){
par.temperature=intt;
intt=-1;
}else if (intt<0)
{
break;
}
for (int ii = 0; ii < maxloop; ii++)
{
std::cout << "jj:" << ii<<"/"<<par.temperature<<"/"<<ppi << std::endl;
auto inputBuffer = defreadwritebuff(ismod, sizeof(point) * pointlist.size(), &pointlist[0]);
for (int i = 0; i < looplen * globalsize * 2; i++)
{
randlist[i] =(rnd()>>2);
// std::cout<< randlist[i]<<std::endl;
}
// std::cout<<"hhh"<<std::endl;
auto inputBuffer_randlist = defreadbuff(ismod, sizeof(randlist), &randlist[0]);
auto parbuff = defreadwritebuff(ismod, sizeof(parameter), &par);
// std::cout<<"hhh"<<std::endl;
setKernelArg(ismod, 0, inputBuffer);
setKernelArg(ismod, 1, inputBuffer_randlist);
setKernelArg(ismod, 2, parbuff);
// std::cout<<"hhh3"<<std::endl;
runker(ismod, globalsize);
// std::cout<<"hhh"<<std::endl;
readdata(ismod, inputBuffer, sizeof(point) * pointlist.size(), &pointlist[0]);
readdata(ismod, parbuff, sizeof(parameter), &par);
meanover = meanover / 8 * 7. + par.over / 8.;
std::cout << "mean:" << (meanover - par.over) / meanover << "/" << meanover << "/" << par.over << std::endl;
clReleaseMemObject(inputBuffer); // Release mem object.
clReleaseMemObject(inputBuffer_randlist); // Release mem object.
clReleaseMemObject(parbuff); // Release mem object.
if (ii>50 &&(abs(meanover - par.over) < 0.01 * meanover || abs(meanover - par.over) < 0.001))
{
int total = 0;
for (auto iii : pointlist)
{
total = total + iii.updown;
}
ciju.push_back(total);
std::cout << "total" << total << std::endl;
break;
}
if(ii==maxloop-1){
int total = 0;
for (auto iii : pointlist)
{
total = total + iii.updown;
}
ciju.push_back(total);
std::cout << "total" << total << std::endl;
}
for(int i=0;i<cpp.size();i++){
pointlist[i]=cpp[i];
}
}
}
}
if (intt<0)
{
std::ofstream fout;
fout.open("fram.txt", std::ios::out);
// for (auto pp : pointlist)
// {
// fout << pp.from << " " << pp.updown << std::endl;
// }
for (auto pp : pointlist)
{
fout << pp.from<<" "<<pp.updown << std::endl;
}
fout.close();
}
std::ofstream fout;
fout.open(outf, std::ios::out);
// for (auto pp : pointlist)
// {
// fout << pp.from << " " << pp.updown << std::endl;
// }
for (auto pp : ciju)
{
fout << pp << std::endl;
}
fout.close();
fin.close();
}