43 #include <ACG/GL/acg_glew.hh>
47 #include <ACG/GL/ShaderCache.hh>
48 #include <ACG/ShaderUtils/GLSLShader.hh>
49 #include <ACG/GL/GLFormatInfo.hh>
// Directory prefix that is concatenated with the file names of all debug dumps in this file.
53 #define SAT_DBG_DIR "c:/dbg/sat_out/"
// Static table from GL internal format to the name of the matching SAT_* datatype macro.
// The PrefixSumPlan constructor fills it while it is still empty.
65 std::map<GLenum, const char*> PrefixSumPlan::datatypeMacros_;
// Sets up a prefix-sum plan for a _w x _h input with internal format _internalFmt:
// stores the dimensions, derives the compute work-group counts, collects the shader
// macro lines and looks up the element size for the format.
// NOTE(review): this listing omits some source lines (braces, a few statements and
// possibly conditions), so the comments below cover only the statements shown.
67 PrefixSumPlan::PrefixSumPlan(
int _w,
int _h, GLenum _internalFmt,
int _blocksize)
68 : width_(_w), height_(_h), blocksize_(_blocksize), internalFmt_(_internalFmt), elemSize_(0),
70 dbgOutput_(0), dbgTranposedInput_(0), dbgProfile_(0)
// From here on the local _blocksize holds the value returned by paddedBlocksize().
73 _blocksize = paddedBlocksize(_blocksize);
// Work groups along x: _w divided by the block size, rounded up.
78 numWorkGroupsX_ = _w / ( _blocksize) + (_w % _blocksize ? 1 : 0);
// Never fewer than one work group per axis.
81 numWorkGroupsX_ = std::max(numWorkGroupsX_, 1);
82 numWorkGroupsY_ = std::max(numWorkGroupsY_, 1);
// Block-scan groups: half the work-group count, rounded up, and at least one.
84 numBlockScanGroupsX_ = numWorkGroupsX_ / 2 + numWorkGroupsX_ % 2;
85 numBlockScanGroupsY_ = numWorkGroupsY_ / 2 + numWorkGroupsY_ % 2;
87 numBlockScanGroupsX_ = std::max(numBlockScanGroupsX_, 1);
88 numBlockScanGroupsY_ = std::max(numBlockScanGroupsY_, 1);
// Macro line defining SAT_BLOCKSIZE with the padded block size.
95 blocksizeDef.sprintf(
"#define SAT_BLOCKSIZE %i", _blocksize);
96 macros_.push_back(blocksizeDef);
// NOTE(review): any condition guarding the SAT_2D macro is not visible in this listing.
99 macros_.push_back(
"#define SAT_2D");
// Populate the static format -> macro-name table while it is still empty.
101 if (datatypeMacros_.empty())
103 datatypeMacros_[GL_R32F] =
"SAT_FLOAT1";
104 datatypeMacros_[GL_RG32F] =
"SAT_FLOAT2";
105 datatypeMacros_[GL_RGBA32F] =
"SAT_FLOAT4";
107 datatypeMacros_[GL_R32I] =
"SAT_INT1";
108 datatypeMacros_[GL_RG32I] =
"SAT_INT2";
109 datatypeMacros_[GL_RGBA32I] =
"SAT_INT4";
111 datatypeMacros_[GL_R32UI] =
"SAT_UINT1";
112 datatypeMacros_[GL_RG32UI] =
"SAT_UINT2";
113 datatypeMacros_[GL_RGBA32UI] =
"SAT_UINT4";
// Find the macro name registered for the requested format.
116 std::map<GLenum, const char*>::iterator datatype = datatypeMacros_.find(_internalFmt);
118 if (datatype != datatypeMacros_.end())
120 macros_.push_back(QString(
"#define SAT_DATATYPE ") + QString(datatype->second));
122 bool success =
false;
// Element size = 4 * the last character of the macro name parsed as a number,
// e.g. "SAT_FLOAT4" gives 16 (presumably bytes, 4 per 32-bit channel - confirm).
123 elemSize_ = 4 * QString(datatype->second).right(1).toInt(&success);
126 std::cout <<
"SATPlan: failed to get size of format " << datatype->second << std::endl;
// Message for a format that has no entry in the table.
129 std::cout <<
"SATPlan: unsupported texture format " << _internalFmt << std::endl;
// More than one work group along x: add the SAT_BLOCKSCANOUT macro and create a
// nested plan of width numWorkGroupsX_ that is stored in blockSumPlan_.
133 if (numWorkGroupsX_ > 1)
135 macros_.push_back(
"#define SAT_BLOCKSCANOUT");
136 blockSumPlan_ =
new PrefixSumPlan(numWorkGroupsX_, height_, _internalFmt, _blocksize);
// Destructor.
// NOTE(review): the body is not part of this listing. The constructor allocates
// blockSumPlan_ with new, so confirm that it is released here.
140 PrefixSumPlan::~PrefixSumPlan()
144 int PrefixSumPlan::paddedDimension(
int _dim)
const
146 int padding = _dim % (2*blocksize_);
148 return _dim + 2*blocksize_ - padding;
152 int PrefixSumPlan::paddedBlocksize(
int _size )
const
155 while (i < _size) i <<= 1;
// Fragment of the buffer-based (1D) prefix-sum execution: the dumps below use
// GL_TEXTURE_BUFFER and "1d_*" file names.
// NOTE(review): the function signature, the shader lookups, braces and the conditions
// around the profiling/debug statements are on lines this listing omits.
168 perfCounter_.restart();
// Source is bound read-only to image unit 0, destination write-only to unit 1.
170 _src->bindAsImage(0, GL_READ_ONLY);
171 _dst->bindAsImage(1, GL_WRITE_ONLY);
// Multi-group case: both block-sum buffers are allocated on first use with
// numBlockScanGroupsX_ * 2 * elemSize_ as size; blockSums_ is bound write-only to unit 2.
173 if (numWorkGroupsX_ > 1)
175 if (!blockSums_.is_valid())
177 blockSums_.setBufferData(numBlockScanGroupsX_ * 2 * elemSize_, 0, internalFmt_);
178 blockSumsOut_.setBufferData(numBlockScanGroupsX_ * 2 * elemSize_, 0, internalFmt_);
180 blockSums_.bindAsImage(2, GL_WRITE_ONLY);
// First dispatch over numWorkGroupsX_ x numWorkGroupsY_ groups; the barrier orders the
// shader image writes before the image accesses of the following steps.
186 glDispatchCompute(numWorkGroupsX_,numWorkGroupsY_,1);
187 glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
// Timing output for the first dispatch.
190 std::cout <<
"psum: " << perfCounter_.
elapsedMs() << std::endl;
// Debug dump of the destination buffer after the first dispatch.
194 ACG::GLDebug::dumpBufferData(GL_TEXTURE_BUFFER, _dst->getBufferId(), SAT_DBG_DIR
"1d_pass1.bin");
// Multi-group case: the nested plan processes blockSums_ into blockSumsOut_.
197 if (numWorkGroupsX_ > 1)
202 perfCounter_.restart();
204 blockSumPlan_->execute(&blockSums_, &blockSumsOut_);
207 std::cout <<
"psum-blockscan: " << perfCounter_.
elapsedMs() << std::endl;
// Block-merge dispatch: destination bound read-write (unit 0), blockSumsOut_ read-only
// (unit 1); numWorkGroupsX_ - 1 groups are launched.
216 perfCounter_.restart();
218 _dst->bindAsImage(0, GL_READ_WRITE);
219 blockSumsOut_.bindAsImage(1, GL_READ_ONLY);
221 satCSBlockMerge->
use();
222 glDispatchCompute(numWorkGroupsX_-1,1,1);
223 glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
226 std::cout <<
"psum-blockmerge: " << perfCounter_.
elapsedMs() << std::endl;
// Debug dumps of the final destination buffer and, if it exists, of blockSumsOut_.
238 ACG::GLDebug::dumpBufferData(GL_TEXTURE_BUFFER, _dst->getBufferId(), SAT_DBG_DIR
"1d_act.bin");
239 if (blockSumsOut_.is_valid())
240 ACG::GLDebug::dumpBufferData(GL_TEXTURE_BUFFER, blockSumsOut_.getBufferId(), SAT_DBG_DIR
"1d_bsum.bin");
// Fragment of the 2D-texture prefix-sum execution: it works on blockSums2D_ and dumps
// textures with "2d_*" file names.
// NOTE(review): the function signature, the shader lookups, braces and the conditions
// around the profiling/debug statements are on lines this listing omits.
251 bool success =
false;
// Debug dump of the input; the file name is "2d_cols_input.bin" or "2d_rows_input.bin"
// depending on dbgTranposedInput_.
257 ACG::GLDebug::dumpTexture2D(_src->id(), 0, _src->getFormat(), _src->getType(), elemSize_ * _src->getWidth() * _src->getHeight(), QString(QString(SAT_DBG_DIR
"2d_") + QString(dbgTranposedInput_ ?
"cols_" :
"rows_") + QString(
"input.bin")).toLatin1(),
true);
265 perfCounter_.restart();
// Source is bound read-only to image unit 0, destination write-only to unit 1.
267 _src->bindAsImage(0, GL_READ_ONLY);
268 _dst->bindAsImage(1, GL_WRITE_ONLY);
// Multi-group case: both block-sum textures are created on first use with
// 2 * numBlockScanGroupsX_ columns and height_ rows, nearest filtering;
// blockSums2D_ is bound write-only to unit 2.
270 if (numWorkGroupsX_ > 1)
272 if (!blockSums2D_.is_valid())
274 blockSums2D_.setStorage(1, internalFmt_, 2 * numBlockScanGroupsX_, height_);
275 blockSums2D_.parameter(GL_TEXTURE_MIN_FILTER, GL_NEAREST);
276 blockSums2D_.parameter(GL_TEXTURE_MAG_FILTER, GL_NEAREST);
277 blockSums2DOut_.setStorage(1, internalFmt_, 2 * numBlockScanGroupsX_, height_);
278 blockSums2DOut_.parameter(GL_TEXTURE_MIN_FILTER, GL_NEAREST);
279 blockSums2DOut_.parameter(GL_TEXTURE_MAG_FILTER, GL_NEAREST);
282 blockSums2D_.bindAsImage(2, GL_WRITE_ONLY);
// First dispatch: numWorkGroupsX_ groups along x, one group per row (height_) along y.
287 glDispatchCompute(numWorkGroupsX_,height_,1);
288 glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
// NOTE(review): this label is also printed after the nested plan below; the buffer
// variant labels its first dispatch "psum: ".
291 std::cout <<
"psum-blockscan: " << perfCounter_.
elapsedMs() << std::endl;
// Debug dump after the first dispatch: "2d_pass1_cols.bin" or "2d_pass1_rows.bin".
295 ACG::GLDebug::dumpTexture2D(_dst->id(), 0, _dst->getFormat(), _dst->getType(), elemSize_ * _dst->getWidth() * _dst->getHeight(), (QString(SAT_DBG_DIR
"2d_pass1") + QString(dbgTranposedInput_?
"_cols" :
"_rows") + QString(
".bin")).toLatin1(),
true);
// Multi-group case: the nested plan processes blockSums2D_ into blockSums2DOut_.
298 if (numWorkGroupsX_ > 1)
305 perfCounter_.restart();
307 blockSumPlan_->execute(&blockSums2D_, &blockSums2DOut_);
310 std::cout <<
"psum-blockscan: " << perfCounter_.
elapsedMs() << std::endl;
// Block-merge dispatch: destination bound read-write (unit 0), blockSums2DOut_ read-only
// (unit 1); (numWorkGroupsX_ - 1) x height_ groups are launched.
320 perfCounter_.restart();
322 _dst->bindAsImage(0, GL_READ_WRITE);
323 blockSums2DOut_.bindAsImage(1, GL_READ_ONLY);
325 satCSBlockMerge->
use();
326 glDispatchCompute(numWorkGroupsX_-1,height_,1);
327 glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
330 std::cout <<
"psum-blockmerge: " << perfCounter_.
elapsedMs() << std::endl;
// Debug dump of the final result.
// NOTE(review): the concatenation yields "2d__cols.bin" / "2d__rows.bin" (double underscore).
344 ACG::GLDebug::dumpTexture2D(_dst->id(), 0, _dst->getFormat(), _dst->getType(), elemSize_ * _dst->getWidth() * _dst->getHeight(), QString(QString(SAT_DBG_DIR
"2d_") + QString(dbgTranposedInput_?
"_cols" :
"_rows") + QString(
".bin")).toLatin1(),
true);
// Debug dump of the block sums, if that texture exists.
// NOTE(review): the size argument uses width_ * height_, while blockSums2DOut_ was created
// with 2 * numBlockScanGroupsX_ columns - confirm the intended size. The name again
// contains a double underscore ("2d__cols_bsum.bin" / "2d__rows_bsum.bin").
345 if (blockSums2DOut_.is_valid())
346 ACG::GLDebug::dumpTexture2D(blockSums2DOut_.id(), 0, blockSums2DOut_.getFormat(), blockSums2DOut_.getType(), elemSize_ * width_ * height_, QString(QString(SAT_DBG_DIR
"2d_") + QString(dbgTranposedInput_?
"_cols" :
"_rows") + QString(
"_bsum.bin")).toLatin1(),
true);
// Fragment that copies _src into a (possibly taller) destination texture via a compute dispatch.
// NOTE(review): presumably the body of padInput(), judging by the _padWidthAndHeight
// parameter; the signature, the shader lookup and the braces are not in this listing.
358 bool success =
false;
// Source is bound read-only to image unit 0.
366 _src->bindAsImage(0, GL_READ_ONLY);
// Destination height: height_, or paddedDimension(height_) when padding is requested.
368 int padHeight = height_;
369 if (_padWidthAndHeight)
370 padHeight = paddedDimension(padHeight);
// Storage for the destination is created only if the texture is not valid yet.
372 if (!_dst->is_valid())
374 _dst->setStorage(1, internalFmt_, width_, padHeight);
375 _dst->parameter(GL_TEXTURE_MIN_FILTER, GL_NEAREST);
376 _dst->parameter(GL_TEXTURE_MAG_FILTER, GL_NEAREST);
// Destination is bound write-only to image unit 1.
379 _dst->bindAsImage(1, GL_WRITE_ONLY);
// numWorkGroupsX_ groups along x and padHeight / blocksize_ groups along y.
381 glDispatchCompute(numWorkGroupsX_, padHeight / blocksize_, 1);
// Sets up a summed-area-table plan that works with two sub-plans, rows_ and cols_, and
// prepares the macro list for the transpose shader.
// NOTE(review): the statements that create rows_ and cols_ and the braces are on lines
// this listing omits.
392 SATPlan::SATPlan(
int _w,
int _h, GLenum _internalFmt ,
int _blocksize )
393 : rows_(0), cols_(0), paddingRequired_(false), transposeGroupSize_(0)
// Padding is needed when the requested size differs from the widths the sub-plans use.
398 paddingRequired_ = _w != rows_->width() || _h != cols_->width();
// Debug flag telling each sub-plan whether its input is the transposed one.
400 rows_->debugSetTransposedInput(0);
401 cols_->debugSetTransposedInput(1);
// The transpose shader starts from the macro list of the row plan.
403 transposeMacros_ = rows_->macros();
// The transpose group is _blocksize x _blocksize invocations; if that exceeds the
// per-work-group invocation limit, the edge length is reduced to floor(sqrt(limit)).
406 transposeGroupSize_ = _blocksize;
407 int maxInvocations = GLSL::ComputeShader::caps().maxWorkGroupInvocations_;
409 if (transposeGroupSize_*transposeGroupSize_ > maxInvocations)
411 transposeGroupSize_ = int(sqrtf(
float(maxInvocations)));
// Redefine SAT_BLOCKSIZE for the transpose shader with the reduced size.
413 QString cappedBlocksize;
414 cappedBlocksize.sprintf(
"#define SAT_BLOCKSIZE %d", transposeGroupSize_);
416 transposeMacros_.push_back(
"#undef SAT_BLOCKSIZE");
417 transposeMacros_.push_back(cappedBlocksize);
// Fragment of the transpose step: dispatches a compute shader that reads _src and writes _dst.
// NOTE(review): presumably the body of SATPlan::transpose(); the signature, the shader
// lookup (transposeCS) and the braces are not in this listing.
433 if (rows_->profilingEnabled())
434 perfCounter_.restart();
// Source is bound read-only to image unit 0, destination write-only to unit 1.
436 _src->bindAsImage(0, GL_READ_ONLY);
437 _dst->bindAsImage(1, GL_WRITE_ONLY);
439 int w = _src->getWidth();
440 int h = _src->getHeight();
441 int blocksize = transposeGroupSize_;
// Group counts: source width and height divided by the group size, rounded up.
443 int numGroupsX = w / blocksize + (w % blocksize ? 1 : 0);
444 int numGroupsY = h / blocksize + (h % blocksize ? 1 : 0);
447 glDispatchCompute(numGroupsX, numGroupsY, 1);
448 glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
450 if (rows_->profilingEnabled())
451 std::cout <<
"transpose: " << perfCounter_.
elapsedMs() << std::endl;
// Reports whether the transpose shader pointer is non-null.
// NOTE(review): whether the dispatch above is guarded by the same check is not visible here.
454 return transposeCS != 0;
// Fragment of the summed-area-table execution. Visible order of steps: optional padding
// of the input, row pass, transpose, column pass on the transposed data, transpose back
// into _dst.
// NOTE(review): the signature, braces and any checks of 'success' between the steps are
// on lines this listing omits. As shown, each later assignment overwrites the earlier
// result - confirm that failures are not lost.
462 if (paddingRequired_)
464 if (rows_->profilingEnabled())
465 perfCounter_.restart();
// The padded copy of the input is created on first use with the widths of the two sub-plans.
467 if (!paddedInput_.is_valid())
469 paddedInput_.setStorage(1, _src->getInternalFormat(), rows_->width(), cols_->width());
470 paddedInput_.parameter(GL_TEXTURE_MIN_FILTER, GL_NEAREST);
471 paddedInput_.parameter(GL_TEXTURE_MAG_FILTER, GL_NEAREST);
473 rows_->padInput(_src, &paddedInput_,
true);
475 if (rows_->profilingEnabled())
476 std::cout <<
"padding: " << perfCounter_.
elapsedMs() << std::endl;
480 if (rows_->profilingEnabled())
481 perfCounter_.restart();
// Row pass: reads the padded copy when padding is required, otherwise _src; writes _dst.
484 bool success = rows_->execute(paddingRequired_ ? &paddedInput_ : _src, _dst);
486 if (rows_->profilingEnabled())
487 std::cout <<
"rows: " << perfCounter_.
elapsedMs() << std::endl;
// Intermediate textures for the column pass, created on first use with the
// dimensions of the column plan.
492 if (!transposedSrc_.is_valid())
494 transposedSrc_.setStorage(1, _src->getInternalFormat(), cols_->width(), cols_->height());
495 transposedSrc_.parameter(GL_TEXTURE_MIN_FILTER, GL_NEAREST);
496 transposedSrc_.parameter(GL_TEXTURE_MAG_FILTER, GL_NEAREST);
498 if (!transposedDst_.is_valid())
500 transposedDst_.setStorage(1, _src->getInternalFormat(), cols_->width(), cols_->height());
501 transposedDst_.parameter(GL_TEXTURE_MIN_FILTER, GL_NEAREST);
502 transposedDst_.parameter(GL_TEXTURE_MAG_FILTER, GL_NEAREST);
// Transpose the row-pass result into transposedSrc_.
506 success = transpose(_dst, &transposedSrc_);
509 if (rows_->debugOutputEnabled())
510 ACG::GLDebug::dumpTexture2D(transposedSrc_.id(), 0, transposedSrc_.getFormat(), transposedSrc_.getType(), rows_->elemSize() * transposedSrc_.getWidth() * transposedSrc_.getHeight(), SAT_DBG_DIR
"2d_rows_tr.bin",
true);
515 if (rows_->profilingEnabled())
516 perfCounter_.restart();
// Column pass on the transposed data.
519 success = cols_->execute(&transposedSrc_, &transposedDst_);
521 if (rows_->profilingEnabled())
522 std::cout <<
"cols: " << perfCounter_.
elapsedMs() << std::endl;
// NOTE(review): this dump and the one at the end pass the format, type and size of
// transposedSrc_ although they dump a different texture - confirm they always match.
525 if (rows_->debugOutputEnabled())
526 ACG::GLDebug::dumpTexture2D(transposedDst_.id(), 0, transposedSrc_.getFormat(), transposedSrc_.getType(), rows_->elemSize() * transposedSrc_.getWidth() * transposedSrc_.getHeight(), SAT_DBG_DIR
"2d_sat_tr.bin",
true);
// Transpose the column-pass result back into _dst.
532 success = transpose(&transposedDst_, _dst);
535 if (rows_->debugOutputEnabled())
536 ACG::GLDebug::dumpTexture2D(_dst->id(), 0, transposedSrc_.getFormat(), transposedSrc_.getType(), rows_->elemSize() * transposedSrc_.getWidth() * transposedSrc_.getHeight(), SAT_DBG_DIR
"2d_sat.bin",
true);
545 void SATPlan::enableDebugOutput()
547 rows_->enableDebugOutput();
548 cols_->enableDebugOutput();
551 void SATPlan::enableProfiling()
553 rows_->enableProfiling();
554 cols_->enableProfiling();
// Self-test for the buffer-based prefix sum: uploads generated integer data, runs a plan
// on it, compares the result read back from the GPU with a prefix sum computed on the CPU
// and writes input, actual and expected data to files under SAT_DBG_DIR.
// NOTE(review): several declarations (numTestVals, blockSize, elemDim, the buffers, the
// plan), the braces, the conditions that presumably depend on cmpMem / fullOutput, the
// cleanup and the return statement are on lines this listing omits.
563 bool PrefixSumPlan::testBuffer(
int w,
int cmpMem,
int fullOutput)
569 int numBlocks = numTestVals / blockSize;
570 GLenum internalfmt = GL_R32I;
// Generated input: channel 0 = i, channel 1 = i+1, channel 2 = -i, channel 3 = 2*i.
// NOTE(review): writes beyond channel 0 are presumably guarded by elemDim checks on omitted lines.
573 std::vector<int> testData(numTestVals * elemDim);
575 for (
int i = 0; i < numTestVals; ++i)
577 testData[i*elemDim] = i;
579 testData[i*elemDim+1] = i+1;
581 testData[i*elemDim+2] = -i;
583 testData[i*elemDim+3] = i*2;
// Input and output buffer both start out with the test data.
586 testBuffer.setBufferData(testData.size()*
sizeof(testData[0]), &testData[0], internalfmt);
587 testBufferOut.setBufferData(testData.size()*
sizeof(testData[0]), &testData[0], internalfmt);
// CPU reference: exclusive prefix sum per channel (element 0 stays 0, element i is the
// sum of the inputs 0..i-1).
590 std::vector<int> expectedData(numTestVals * elemDim, 0);
592 for (
int i = 1; i < numTestVals; ++i)
594 for (
int k = 0; k < elemDim; ++k)
595 expectedData[i*elemDim + k] = testData[(i-1)*elemDim + k] + expectedData[(i-1)*elemDim + k];
// Run the plan on the GPU.
603 plan.enableDebugOutput();
604 plan.execute(&testBuffer, &testBufferOut);
// Read the result back (4 bytes per channel value) and compare it bytewise with the reference.
611 int bufSize = numTestVals * elemDim * 4;
612 char* pActBuf =
new char[bufSize];
613 testBufferOut.getBufferData(pActBuf);
615 if (memcmp(pActBuf, &expectedData[0], bufSize))
617 printf(
"error for %d\n", w);
// Dump files: actual result, input, expected result.
623 char szFileOut[0xff];
624 sprintf(szFileOut, SAT_DBG_DIR
"1d_act_%03d.bin", numTestVals);
625 FILE* pFile = fopen(szFileOut,
"wb");
628 fwrite(pActBuf, 1, bufSize, pFile);
632 sprintf(szFileOut, SAT_DBG_DIR
"1d_in_%03d.bin", numTestVals);
633 pFile = fopen(szFileOut,
"wb");
636 fwrite(&testData[0],
sizeof(testData[0]), testData.size(), pFile);
640 sprintf(szFileOut, SAT_DBG_DIR
"1d_exp_%03d.bin", numTestVals);
641 pFile = fopen(szFileOut,
"wb");
644 fwrite(&expectedData[0],
sizeof(expectedData[0]), expectedData.size(), pFile);
// Expected block sums: for each block i >= 1, the difference between the reference value
// at the start of block i and a 'shift' value taken from the start of block i-1.
// NOTE(review): how 'shift' is initialised and when it is assigned is not visible here.
648 sprintf(szFileOut, SAT_DBG_DIR
"1d_expbsum_%03d.bin", numTestVals);
649 pFile = fopen(szFileOut,
"wb");
654 for (
int i = 1; i < numBlocks; ++i)
656 memset(bsum, 0,
sizeof(bsum));
658 for (
int e = 0; e < elemDim; ++e)
663 shift = expectedData[((i-1)*blockSize) * elemDim + e];
665 bsum[e] = expectedData[(i*blockSize) * elemDim + e] - shift;
669 fwrite(bsum, 4, elemDim, pFile);
// Self-test for the 2D summed-area table: builds w*h integer test values, computes the
// reference on the CPU (row pass followed by column pass), runs a SATPlan on the GPU,
// compares the results bytewise and writes expected/actual data to files under SAT_DBG_DIR.
// NOTE(review): several declarations (blockSize, finfo, the textures), the test-data
// loop body, braces, the conditions that presumably depend on cmpMem / fullOutput, the
// cleanup and the return statement are on lines this listing omits.
694 bool PrefixSumPlan::test2D(
int w,
int h,
int cmpMem,
int fullOutput )
696 int numTestVals = w*h;
698 int numBlocks = numTestVals / blockSize;
699 GLenum internalfmt = GL_R32I;
701 int elemDim = finfo.channelCount();
// Test input; the loop body that fills it is not part of this listing.
703 std::vector<int> testData(numTestVals);
704 for (
int i = 0; i < numTestVals; ++i)
// CPU reference: start from the input, apply the row pass, later the column pass.
712 std::vector<int> expectedData = testData;
714 PrefixSumPlan::executeRowsCPU(w,h, expectedData);
717 char szFileOut[0xff];
729 PrefixSumPlan::executeColsCPU(w,h,expectedData);
// Upload the input and allocate the output texture, both with nearest filtering.
735 testBuffer.setData(0, internalfmt, w, h, finfo.format(), finfo.type(), &testData[0]);
737 testBufferOut.setStorage(1, internalfmt, w, h);
740 testBuffer.parameter(GL_TEXTURE_MIN_FILTER, GL_NEAREST);
741 testBuffer.parameter(GL_TEXTURE_MAG_FILTER, GL_NEAREST);
743 testBufferOut.bind();
744 testBufferOut.parameter(GL_TEXTURE_MIN_FILTER, GL_NEAREST);
745 testBufferOut.parameter(GL_TEXTURE_MAG_FILTER, GL_NEAREST);
// Run the summed-area-table plan on the GPU.
752 SATPlan plan(w,h, internalfmt, blockSize);
755 plan.enableDebugOutput();
756 plan.execute(&testBuffer, &testBufferOut);
// Read the result back (4 bytes per channel value) and compare it bytewise with the reference.
762 int bufSize = numTestVals * elemDim * 4;
763 char* pActBuf =
new char[bufSize];
765 ACG::GLDebug::getTextureData2D(testBufferOut.id(), 0, finfo.format(), finfo.type(), bufSize, pActBuf);
768 if (memcmp(pActBuf, &expectedData[0], bufSize))
770 printf(
"error for %dx%d\n", w,h);
// Dump files: expected result and actual result.
779 sprintf(szFileOut, SAT_DBG_DIR
"2d_exp_%dx%d.bin", w,h);
780 FILE* pFile = fopen(szFileOut,
"wb");
783 fwrite(&expectedData[0],
sizeof(expectedData[0]), expectedData.size(), pFile);
788 sprintf(szFileOut, SAT_DBG_DIR
"2d_act_%dx%d.bin", w,h);
789 ACG::GLDebug::dumpTexture2D(testBufferOut.id(), 0, finfo.format(), finfo.type(), numTestVals * elemDim * 4, szFileOut,
true);
// Recompute the rows-only reference from the input and dump it as well.
793 expectedData = testData;
794 PrefixSumPlan::executeRowsCPU(w,h,expectedData);
795 sprintf(szFileOut, SAT_DBG_DIR
"2d_exp_rows_%dx%d.bin", w,h);
796 pFile = fopen(szFileOut,
"wb");
799 fwrite(&expectedData[0],
sizeof(expectedData[0]), expectedData.size(), pFile);
static ShaderCache * getInstance()
Return instance of the ShaderCache singleton.
GLuint64 elapsedMs()
elapsed gpu time in millisecs
GLSL::Program * getComputeProgram(const char *_computeShaderFile, QStringList *_macros=0, bool _verbose=true)
Query a static compute shader program from cache.
void use()
Enables the program object for using.