29 #ifndef DOXY_IGNORE_THIS
// Fragments of three SparseMatrix< T > constructors (original lines 47-68; interior
// lines, including the signatures, are missing from this extraction).
// Each constructor starts from an empty state: no per-row entry cap, and null
// row-size / element-array pointers. The third fragment additionally delegates to
// Resize( rows , maxEntriesPerRow ), so it presumably is the (rows, maxEntriesPerRow)
// constructor -- TODO confirm against the full file.
47 _maxEntriesPerRow = 0;
49 rowSizes = NullPointer< int >( );
50 m_ppElements = NullPointer< Pointer( MatrixEntry< T > ) >( );
55 _maxEntriesPerRow = 0;
57 rowSizes = NullPointer< int >( );
58 m_ppElements = NullPointer< Pointer( MatrixEntry< T > ) >( );
64 _maxEntriesPerRow = 0;
66 rowSizes = NullPointer< int >( );
67 m_ppElements = NullPointer< Pointer( MatrixEntry< T > ) >( );
68 Resize( rows , maxEntriesPerRow );
// Fragment of SparseMatrix< T > copy construction (original lines 79-88).
// Starts from the empty state, then mirrors M's storage layout: a contiguous
// source is resized with its per-row cap, a non-contiguous one row-by-row.
// SetRowSize copies each row's entry count; the per-entry copy that presumably
// follows (line 89+) is not visible here.
79 _maxEntriesPerRow = 0;
81 rowSizes = NullPointer< int >( );
82 m_ppElements = NullPointer< Pointer( MatrixEntry< T > ) >( );
84 if( M._contiguous ) Resize( M.rows , M._maxEntriesPerRow );
85 else Resize( M.rows );
86 for(
int i=0 ; i<rows ; i++ )
88 SetRowSize( i , M.rowSizes[i] );
// Fragment of an entry-count accessor (original line 97): accumulates the
// per-row sizes into `e` (declared on a missing line), i.e. total stored entries.
97 for(
int i=0 ; i<rows ; i++ ) e +=
int( rowSizes[i] );
// Fragment of SparseMatrix< T >::operator= (original lines 103-107): reshapes
// this matrix to match M's layout (contiguous vs. per-row) and copies the row
// sizes; the element copy on the missing lines is not visible here.
103 if( M._contiguous ) Resize( M.rows , M._maxEntriesPerRow );
104 else Resize( M.rows );
105 for(
int i=0 ; i<rows ; i++ )
107 SetRowSize( i , M.rowSizes[i] );
// Fragments of the filename-based write/read wrappers (original lines 119-130):
// open the file in binary mode, bail out with false on failure, and delegate to
// the FILE*-based overload. The fclose / return-ret lines (122, 131) are missing
// from this extraction -- presumably they close fp and return ret.
119 FILE* fp = fopen( fileName ,
"wb" );
120 if( !fp )
return false;
121 bool ret = write( fp );
128 FILE* fp = fopen( fileName ,
"rb" );
129 if( !fp )
return false;
130 bool ret = read( fp );
// Fragment of SparseMatrix< T >::write( FILE* ) (original lines 137-139).
// On-disk layout: row count, then the rowSizes table, then each row's raw
// MatrixEntry< T > array. Every short write aborts with false.
// NOTE(review): this is a raw memory dump -- format depends on endianness and
// on sizeof( MatrixEntry< T > ), so files are not portable across platforms.
137 if( fwrite( &rows ,
sizeof(
int ) , 1 , fp )!=1 )
return false;
138 if( fwrite( rowSizes ,
sizeof(
int ) , rows , fp )!=rows )
return false;
139 for(
int i=0 ; i<rows ; i++ ) if( fwrite( (*this)[i] , sizeof( MatrixEntry< T > ) , rowSizes[i] , fp )!=rowSizes[i] )
return false;
// Fragment of SparseMatrix< T >::read( FILE* ) (original lines 146-154), the
// inverse of write(): reads the row count into r, the rowSizes table, then each
// row's entries. The missing line 147 presumably calls Resize( r ) before
// rowSizes is filled -- TODO confirm; otherwise reading `rows` sizes into an
// unsized table would be a buffer overrun.
146 if( fread( &r ,
sizeof(
int ) , 1 , fp )!=1 )
return false;
148 if( fread( rowSizes ,
sizeof(
int ) , rows , fp )!=rows )
return false;
149 for(
int i=0 ; i<rows ; i++ )
154 if( fread( (*
this)[i] ,
sizeof(
MatrixEntry< T > ) , rowSizes[i] , fp )!=rowSizes[i] )
return false;
// Fragment of SparseMatrix< T >::Resize( int r ) (original lines 165-178).
// Frees the old storage -- one block ([0]) in the contiguous case, one per
// non-empty row otherwise -- then the pointer tables themselves, and allocates
// fresh zeroed row-size and row-pointer tables for r rows. Leaves the matrix
// non-contiguous (_maxEntriesPerRow = 0); per-row entry arrays are allocated
// later by SetRowSize.
165 if( _contiguous ){
if( _maxEntriesPerRow ) FreePointer( m_ppElements[0] ); }
166 else for(
int i=0 ; i<rows ; i++ ){
if( rowSizes[i] ) FreePointer( m_ppElements[i] ); }
167 FreePointer( m_ppElements );
168 FreePointer( rowSizes );
173 rowSizes = AllocPointer< int >( r );
174 m_ppElements = AllocPointer< Pointer( MatrixEntry< T > ) >( r );
175 memset( rowSizes , 0 ,
sizeof(
int ) * r );
178 _maxEntriesPerRow = 0;
// Fragment of the contiguous SparseMatrix< T >::Resize( int r , int e )
// (original lines 185-200). Frees old storage exactly as the one-argument
// Resize, then allocates a single r*e block of MatrixEntry< T > and points each
// row i at a fixed-stride slice (m_ppElements[i-1] + e). Row sizes start at 0;
// _maxEntriesPerRow records the per-row capacity e. The line setting
// _contiguous (presumably between 197 and 200) is missing from this view.
185 if( _contiguous ){
if( _maxEntriesPerRow ) FreePointer( m_ppElements[0] ); }
186 else for(
int i=0 ; i<rows ; i++ ){
if( rowSizes[i] ) FreePointer( m_ppElements[i] ); }
187 FreePointer( m_ppElements );
188 FreePointer( rowSizes );
193 rowSizes = AllocPointer< int >( r );
194 m_ppElements = AllocPointer< Pointer( MatrixEntry< T > ) >( r );
195 m_ppElements[0] = AllocPointer< MatrixEntry< T > >( r * e );
196 memset( rowSizes , 0 ,
sizeof(
int ) * r );
197 for(
int i=1 ; i<r ; i++ ) m_ppElements[i] = m_ppElements[i-1] + e;
200 _maxEntriesPerRow = e;
// Fragment of SparseMatrix< T >::SetRowSize (original lines 208-214).
// Contiguous branch: rows share a fixed-capacity slab, so a count above
// _maxEntriesPerRow is fatal (fprintf + exit).
// NOTE(review): the message prints "%d<=%d" while the guard is
// count>_maxEntriesPerRow -- the text states the requirement, not the failure;
// slightly confusing but behaviorally harmless.
// Non-contiguous branch: frees the row's old entry array (if any) and
// reallocates it at the requested size; the rowSizes[row] update for this
// branch is on a missing line.
208 if( count>_maxEntriesPerRow ) fprintf( stderr ,
"[ERROR] Cannot set row size on contiguous matrix: %d<=%d\n" , count , _maxEntriesPerRow ) , exit( 0 );
209 rowSizes[row] = count;
211 else if( row>=0 && row<rows )
213 if( rowSizes[row] ) FreePointer( m_ppElements[row] );
214 if( count>0 ) m_ppElements[row] = AllocPointer< MatrixEntry< T > >( count );
// Two small fragments (original lines 222, 229-230): the first re-allocates the
// matrix at its current dimensions (presumably SetZero -- Resize zeroes row
// sizes; confirm against the full file); the second writes T(1) on the diagonal
// up to min(rows, columns) (presumably SetIdentity).
222 Resize(this->m_N, this->m_M);
229 for(
int ij=0; ij < Min( this->Rows(), this->Columns() ); ij++)
230 (*
this)(ij,ij) = T(1);
// Fragment of the in-place scalar multiply (original lines 244-246): scales
// every stored entry's Value by V.
// NOTE(review): the inner loop increments `i` rather than `ii`
// (`int ii=0;ii<...size();i++`) -- as written the inner index never advances
// and the outer index is bumped twice. This looks like a transcription or
// genuine bug; upstream versions of this code use ii++. Confirm against the
// full file before relying on this operator.
244 for (
int i=0; i<this->Rows(); i++)
246 for(
int ii=0;ii<m_ppElements[i].size();i++){m_ppElements[i][ii].Value*=V;}
// Fragment of sparse matrix-matrix multiplication (original lines 255-260):
// classic row-by-row expansion. For each stored entry (column N, value Value)
// of row i, scatters Value times row N of M into result row i of R.
255 for(
int i=0; i<R.Rows(); i++){
256 for(
int ii=0;ii<m_ppElements[i].size();ii++){
257 int N=m_ppElements[i][ii].N;
258 T Value=m_ppElements[i][ii].Value;
259 for(
int jj=0;jj<M.m_ppElements[N].size();jj++){
260 R(i,M.m_ppElements[N][jj].N) += Value * M.m_ppElements[N][jj].Value;
// Fragments of two matrix-vector products (original lines 273-295).
// Serial version: per row, accumulate entry.Value * V[entry.N] into temp
// (temp's declaration/store lines are missing from this view).
273 for (
int i=0; i<rows; i++)
276 for(
int ii=0;ii<rowSizes[i];ii++){
277 temp+=m_ppElements[i][ii].Value * V.m_pV[m_ppElements[i][ii].N];
// OpenMP version: rows are independent, so the row loop is statically
// partitioned across `threads`; each iteration owns its private temp.
289 #pragma omp parallel for num_threads( threads ) schedule( static )
291 for(
int i=0 ; i<rows ; i++ )
295 for(
int j=0 ; j<rowSizes[i] ; j++ ) temp += m_ppElements[i][j].Value * In.m_pV[m_ppElements[i][j].N];
// Fragment of Transpose (original lines 317-320): writes each stored entry
// (row i, column N) into M at the swapped position (N, i).
317 for (
int i=0; i<this->Rows(); i++)
319 for(
int ii=0;ii<m_ppElements[i].size();ii++){
320 M(m_ppElements[i][ii].N,i) = m_ppElements[i][ii].Value;
// Fragment of a conjugate-gradient solver for a symmetric system (original
// lines 332-376). Standard CG skeleton: r = residual of the initial solution,
// delta_new = r.r, iterate while delta_new > eps*delta_0; each pass computes
// q = M*d, alpha = delta_new / (d.q), steps the solution, refreshes or updates
// the residual, and turns the new/old delta ratio into beta for the next search
// direction d. Several lines (d initialization, delta_0 assignment, the
// residual-refresh condition around 358-359) are missing from this extraction.
// NOTE(review): some OpenMP pragmas here are split across two physical lines
// ("#pragma omp parallel" / "for num_threads..."), which would not compile --
// an artifact of the extraction, not of the original file.
332 solution.Resize( b.Dimensions() );
336 r.Resize( solution.Dimensions() );
337 M.Multiply( solution , r );
340 double delta_new = 0, delta_0 = 0;
341 for(
int i=0 ; i<r.Dimensions() ; i++ ) delta_new += r.m_pV[i] * r.m_pV[i];
// Early out: initial residual already below tolerance.
343 if( delta_new<eps )
return 0;
346 q.Resize( d.Dimensions() );
347 for( ii=0; ii<iters && delta_new>eps*delta_0 ; ii++ )
349 M.Multiply( d , q , threads );
350 double dDotQ = 0 , alpha = 0;
351 for(
int i=0 ; i<d.Dimensions() ; i++ ) dDotQ += d.m_pV[i] * q.m_pV[i];
352 alpha = delta_new / dDotQ;
354 #pragma omp parallel
for num_threads( threads ) schedule(
static )
356 for(
int i=0 ; i<r.Dimensions() ; i++ ) solution.m_pV[i] += d.m_pV[i] * T2( alpha );
// Periodic exact residual refresh (condition line missing from this view).
359 r.Resize( solution.Dimensions() );
360 M.Multiply( solution , r , threads );
365 #pragma omp parallel for num_threads( threads ) schedule( static )
367 for(
int i=0 ; i<r.Dimensions() ; i++ ) r.m_pV[i] = r.m_pV[i] - q.m_pV[i] * T2(alpha);
369 double delta_old = delta_new , beta;
371 for(
int i=0 ; i<r.Dimensions() ; i++ ) delta_new += r.m_pV[i]*r.m_pV[i];
372 beta = delta_new / delta_old;
374 #pragma omp parallel
for num_threads( threads ) schedule(
static )
376 for(
int i=0 ; i<d.Dimensions() ; i++ ) d.m_pV[i] = r.m_pV[i] + d.m_pV[i] * T2( beta );
// Sparse fragment of another iterative solver (original lines 390-398): sizes
// the solution to M's column count and iterates while the residual norm rDotR
// exceeds eps; alpha is the CG-style step rDotR / (d . Md). Too little is
// visible here to characterize the method further.
390 solution.Resize(M.Columns());
395 for(i=0;i<iters && rDotR>eps;i++){
398 alpha=rDotR/d.Dot(Md);
// Fragments of SparseSymmetricMatrix multiply routines (original lines 424-463;
// the qualified SparseMatrix< T >:: accesses indicate a derived class).
// First fragment: plain row/entry iteration (body lines missing).
424 for(
int i=0; i<SparseMatrix< T >::rows; i++){
425 for(
int ii=0;ii<SparseMatrix< T >::rowSizes[i];ii++)
// Second fragment: multiply with an optional DC (mean) term -- dcTerm
// accumulates the sum of the input, the entry loop (temp..end, declarations on
// missing lines) does the symmetric product, and when addDCTerm is set the
// accumulated term is added to every output component.
440 const T2* in = &In[0];
445 for(
int i=0 ; i<SparseMatrix< T >::rows ; i++ ) dcTerm += in[i];
448 for(
int i=0 ; i<SparseMatrix< T >::rows ; i++ )
452 const T2& in_i_ = in[i];
454 for( ; temp!=end ; temp++ )
463 if( addDCTerm )
for(
int i=0 ; i<SparseMatrix< T >::rows ; i++ ) out[i] += dcTerm;
// Fragments of OpenMP multiplies that avoid write conflicts by giving each
// thread its own output buffer (original lines 469-544). Phase 1: every thread
// zeroes its OutScratch[t] slice and accumulates its share of rows into it
// (row loop bodies are on missing lines). Phase 2: a second parallel loop
// reduces the per-thread buffers component-wise into the real output (_out
// accumulation; the surrounding declarations are missing from this view).
// Two variants are interleaved here -- one with a DC-term reduction, one
// without; their separating signatures are not visible.
469 int dim = int( In.Dimensions() );
470 const T2* in = &In[0];
471 int threads = OutScratch.threads();
476 #pragma omp parallel for num_threads( threads ) reduction ( + : dcTerm )
478 for(
int t=0 ; t<threads ; t++ )
480 T2* out = OutScratch[t];
481 memset( out , 0 ,
sizeof( T2 ) * dim );
484 const T2& in_i_ = in[i];
499 dim = int( Out.Dimensions() );
502 #pragma omp parallel for num_threads( threads ) schedule( static )
504 for(
int i=0 ; i<dim ; i++ )
507 for(
int t=0 ; t<threads ; t++ ) _out += OutScratch[t][i];
514 #pragma omp parallel for num_threads( threads )
516 for(
int t=0 ; t<threads ; t++ )
518 T2* out = OutScratch[t];
519 memset( out , 0 ,
sizeof( T2 ) * dim );
536 dim = int( Out.Dimensions() );
539 #pragma omp parallel for num_threads( threads ) schedule( static )
541 for(
int i=0 ; i<dim ; i++ )
544 for(
int t=0 ; t<threads ; t++ ) _out += OutScratch[t][i];
// Fragment of a bounds-partitioned multiply (original lines 553-590): rows are
// pre-partitioned into [bounds[t], bounds[t+1]) ranges, one per thread. Each
// thread zeroes its scratch vector, walks its row range accumulating the
// symmetric product (entry walk temp..end; temp's initialization is on a
// missing line), and a final parallel loop sums the per-thread vectors into Out.
// NOTE(review): the last loop compares int i against Out.Dimensions() --
// a signed/unsigned comparison if Dimensions() returns size_t; confirm.
553 int dim = In.Dimensions();
554 const T2* in = &In[0];
555 int threads = OutScratch.size();
557 #pragma omp parallel for num_threads( threads )
559 for(
int t=0 ; t<threads ; t++ )
560 for(
int i=0 ; i<dim ; i++ ) OutScratch[t][i] = T2(0);
562 #pragma omp parallel for num_threads( threads )
564 for(
int t=0 ; t<threads ; t++ )
566 T2* out = OutScratch[t];
567 for(
int i=bounds[t] ; i<bounds[t+1] ; i++ )
570 const MatrixEntry<T>* end = temp + SparseMatrix< T >::rowSizes[i];
571 const T2& in_i_ = in[i];
573 for( ; temp!=end ; temp++ )
584 #pragma omp parallel for num_threads( threads ) schedule( static )
586 for(
int i=0 ; i<Out.Dimensions() ; i++ )
590 for(
int t=0 ; t<threads ; t++ ) _out += OutScratch[t][i];
// AtomicIncrement for float/double via Windows compare-and-swap (original
// lines 594-624; the lines that add `addend` into the reinterpreted value,
// 601-603 / 615-617, are missing from this extraction). Pattern: snapshot
// *ptr, reinterpret its bits as LONG/LONGLONG, and retry
// InterlockedCompareExchange(64) until the observed value matches the
// comparand, i.e. a classic lock-free read-modify-write loop.
// NOTE(review): InterlockedCompareExchange takes (destination, exchange,
// comparand) and returns the prior value -- here the return is stored back
// into _newValue and compared against _oldValue to detect success; the addend
// application must happen on the missing lines. Guarded by a private include
// guard so both overloads are defined at most once.
594 #ifndef _AtomicIncrement_
595 #define _AtomicIncrement_
597 inline void AtomicIncrement(
volatile float* ptr ,
float addend )
599 float newValue = *ptr;
600 LONG& _newValue = *( (LONG*)&newValue );
604 _oldValue = _newValue;
606 _newValue = InterlockedCompareExchange( (LONG*) ptr , _newValue , _oldValue );
607 if( _newValue==_oldValue )
break;
610 inline void AtomicIncrement(
volatile double* ptr ,
double addend )
613 double newValue = *ptr;
614 LONGLONG& _newValue = *( (LONGLONG*)&newValue );
618 _oldValue = _newValue;
620 _newValue = InterlockedCompareExchange64( (LONGLONG*) ptr , _newValue , _oldValue );
622 while( _newValue!=_oldValue );
624 #endif // _AtomicIncrement_
// Fragments of the float MultiplyAtomic (original lines 629-668): a symmetric
// matrix-vector product where threads write into a SHARED output vector, so
// every scatter goes through AtomicIncrement. For each entry the symmetric
// contribution is applied twice: v*in_i into out[j] (the paired column) and an
// accumulated out_i into out[i] (out_i / j / end setup lines are missing).
// Two copies appear -- one over an explicit row `partition`, one over all of
// A.rows; their separating control flow is not visible in this extraction.
629 const float* in = &In[0];
630 float* out = &Out[0];
633 #pragma omp parallel for num_threads( threads )
635 for(
int t=0 ; t<threads ; t++ )
636 for(
int i=partition[t] ; i<partition[t+1] ; i++ )
640 const float& in_i = in[i];
642 for( ; temp!=end ; temp++ )
645 float v = temp->Value;
647 AtomicIncrement( out+j , v * in_i );
649 AtomicIncrement( out+i , out_i );
653 #pragma omp parallel for num_threads( threads )
655 for(
int i=0 ; i<A.rows ; i++ )
659 const float& in_i = in[i];
661 for( ; temp!=end ; temp++ )
664 float v = temp->Value;
666 AtomicIncrement( out+j , v * in_i );
668 AtomicIncrement( out+i , out_i );
// Fragments of the double MultiplyAtomic (original lines 675-715): mirrors the
// float version -- atomic scatter of symmetric contributions into a shared
// output, in a partitioned variant and an all-rows variant. The v / j / out_i
// declarations sit on missing lines.
675 const double* in = &In[0];
676 double* out = &Out[0];
680 #pragma omp parallel for num_threads( threads )
682 for(
int t=0 ; t<threads ; t++ )
683 for(
int i=partition[t] ; i<partition[t+1] ; i++ )
687 const double& in_i = in[i];
689 for( ; temp!=end ; temp++ )
694 AtomicIncrement( out+j , v * in_i );
696 AtomicIncrement( out+i , out_i );
700 #pragma omp parallel for num_threads( threads )
702 for(
int i=0 ; i<A.rows ; i++ )
706 const double& in_i = in[i];
708 for( ; temp!=end ; temp++ )
713 AtomicIncrement( out+j , v * in_i );
715 AtomicIncrement( out+i , out_i );
// Fragment of the atomic-multiply conjugate-gradient solver (original lines
// 724-820). Setup: raw pointers into x/r/d/q/b, then a load-balancing pass that
// splits rows into `partition` so each thread gets roughly eCount/threads
// stored entries. solveNormal switches to the normal equations: A is applied
// twice (A^T A x, with A symmetric here) and b is pre-multiplied once.
// CG loop: q = A*d (atomic), alpha = delta_new/(d.q), step x, refresh the exact
// residual every 50 iterations (otherwise update incrementally), recompute
// delta and the new direction d = r + beta*d.
// NOTE(review): the delta accumulations use `size_t i=0 ; i<dim` with dim an
// int -- a signed/unsigned comparison; and the warning printf formats eps
// (likely double) with %f while delta_new uses %g. Both cosmetic, confirm
// upstream.
724 int dim = b.Dimensions();
732 if( solveNormal ) temp.Resize( dim );
733 T2 *_x = &x[0] , *_r = &r[0] , *_d = &d[0] , *_q = &q[0];
734 const T2* _b = &b[0];
736 std::vector< int > partition( threads+1 );
739 for(
int i=0 ; i<A.rows ; i++ ) eCount += A.rowSizes[i];
742 #pragma omp parallel
for num_threads( threads )
744 for(
int t=0 ; t<threads ; t++ )
747 for(
int i=0 ; i<A.rows ; i++ )
749 _eCount += A.rowSizes[i];
// Thread t's range ends once its running entry count reaches its share.
750 if( _eCount*threads>=eCount*(t+1) )
757 partition[threads] = A.rows;
761 MultiplyAtomic( A , x , temp , threads , &partition[0] );
762 MultiplyAtomic( A , temp , r , threads , &partition[0] );
763 MultiplyAtomic( A , b , temp , threads , &partition[0] );
765 #pragma omp parallel for num_threads( threads ) schedule( static )
767 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = temp[i] - _r[i];
771 MultiplyAtomic( A , x , r , threads , &partition[0] );
773 #pragma omp parallel for num_threads( threads ) schedule( static )
775 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = _b[i] - _r[i];
777 double delta_new = 0 , delta_0;
778 for(
size_t i=0 ; i<dim ; i++ ) delta_new += _r[i] * _r[i];
782 fprintf( stderr ,
"[WARNING] Initial residual too low: %g < %f\n" , delta_new , eps );
786 for( ii=0; ii<iters && delta_new>eps*delta_0 ; ii++ )
788 if( solveNormal ) MultiplyAtomic( A , d , temp , threads , &partition[0] ) , MultiplyAtomic( A , temp , q , threads , &partition[0] );
789 else MultiplyAtomic( A , d , q , threads , &partition[0] );
791 for(
int i=0 ; i<dim ; i++ ) dDotQ += _d[i] * _q[i];
792 T2 alpha = T2( delta_new / dDotQ );
794 #pragma omp parallel for num_threads( threads ) schedule( static )
796 for(
int i=0 ; i<dim ; i++ ) _x[i] += _d[i] * alpha;
// Every 50th iteration: recompute the residual from scratch to curb drift.
797 if( (ii%50)==(50-1) )
800 if( solveNormal ) MultiplyAtomic( A , x , temp , threads , &partition[0] ) , MultiplyAtomic( A , temp , r , threads , &partition[0] );
801 else MultiplyAtomic( A , x , r , threads , &partition[0] );
803 #pragma omp parallel for num_threads( threads ) schedule( static )
805 for(
int i=0 ; i<dim ; i++ ) _r[i] = _b[i] - _r[i];
809 #pragma omp parallel for num_threads( threads ) schedule( static )
811 for(
int i=0 ; i<dim ; i++ ) _r[i] -= _q[i] * alpha;
813 double delta_old = delta_new;
815 for(
size_t i=0 ; i<dim ; i++ ) delta_new += _r[i] * _r[i];
816 T2 beta = T2( delta_new / delta_old );
818 #pragma omp parallel for num_threads( threads ) schedule( static )
820 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] + _d[i] * beta;
// Fragment of the scratch-buffer conjugate-gradient solver (original lines
// 830-897). Same CG skeleton as SolveAtomic, but A.Multiply uses per-thread
// scratch buffers instead of atomics, and the vector updates fuse the residual
// update, the delta_new reduction, and the x step into single OpenMP
// reduction loops. reset re-zeroes x; solveNormal solves the normal equations
// by applying A twice and pre-multiplying b.
// NOTE(review): the fused update loops also apply `_x[i] += _d[i] * alpha`,
// i.e. the x step is folded into the residual pass except on refresh
// iterations (where it is done separately before the exact residual) --
// intricate ordering, which is why this block is documented rather than
// restyled.
830 int dim = int( b.Dimensions() );
832 if( reset ) x.Resize( dim );
833 if( solveNormal ) temp.Resize( dim );
834 T2 *_x = &x[0] , *_r = &r[0] , *_d = &d[0] , *_q = &q[0];
835 const T2* _b = &b[0];
837 double delta_new = 0 , delta_0;
840 A.Multiply( x , temp , scratch , addDCTerm ) , A.Multiply( temp , r , scratch , addDCTerm ) , A.Multiply( b , temp , scratch , addDCTerm );
842 #pragma omp parallel for num_threads( scratch.threads() ) reduction( + : delta_new )
844 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = temp[i] - _r[i] , delta_new += _r[i] * _r[i];
848 A.Multiply( x , r , scratch , addDCTerm );
850 #pragma omp parallel for num_threads( scratch.threads() ) reduction ( + : delta_new )
852 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i];
857 fprintf( stderr ,
"[WARNING] Initial residual too low: %g < %f\n" , delta_new , eps );
861 for( ii=0 ; ii<iters && delta_new>eps*delta_0 ; ii++ )
863 if( solveNormal ) A.Multiply( d , temp , scratch , addDCTerm ) , A.Multiply( temp , q , scratch , addDCTerm );
864 else A.Multiply( d , q , scratch , addDCTerm );
867 #pragma omp parallel for num_threads( scratch.threads() ) reduction( + : dDotQ )
869 for(
int i=0 ; i<dim ; i++ ) dDotQ += _d[i] * _q[i];
870 T2 alpha = T2( delta_new / dDotQ );
871 double delta_old = delta_new;
// Every 50th iteration: step x first, then recompute the residual exactly.
873 if( (ii%50)==(50-1) )
876 #pragma omp parallel for num_threads( scratch.threads() )
878 for(
int i=0 ; i<dim ; i++ ) _x[i] += _d[i] * alpha;
880 if( solveNormal ) A.Multiply( x , temp , scratch , addDCTerm ) , A.Multiply( temp , r , scratch , addDCTerm );
881 else A.Multiply( x , r , scratch , addDCTerm );
883 #pragma omp parallel for num_threads( scratch.threads() ) reduction( + : delta_new )
885 for(
int i=0 ; i<dim ; i++ ) _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
889 #pragma omp parallel for num_threads( scratch.threads() ) reduction( + : delta_new )
891 for(
int i=0 ; i<dim ; i++ ) _r[i] -= _q[i] * alpha , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
893 T2 beta = T2( delta_new / delta_old );
895 #pragma omp parallel for num_threads( scratch.threads() )
897 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] + _d[i] * beta;
// Fragment of the thread-count-parameterized CG solver (original lines
// 906-1000). Identical CG structure to the scratch-based overload, except every
// A.Multiply call branches on threads>1: multi-threaded calls route through
// outScratch (resized up front to threads x dim), single-threaded calls use the
// plain overload. As above: reset re-zeroes x, solveNormal applies A twice for
// the normal equations, the residual is recomputed exactly every 50 iterations,
// and the x step is fused into the residual/reduction loops otherwise.
906 int dim = int( b.Dimensions() );
908 if( threads<1 ) threads = 1;
909 if( threads>1 ) outScratch.resize( threads , dim );
910 if( reset ) x.Resize( dim );
913 if( solveNormal ) temp.Resize( dim );
914 T2 *_x = &x[0] , *_r = &r[0] , *_d = &d[0] , *_q = &q[0];
915 const T2* _b = &b[0];
917 double delta_new = 0 , delta_0;
921 if( threads>1 ) A.Multiply( x , temp , outScratch , addDCTerm ) , A.Multiply( temp , r , outScratch , addDCTerm ) , A.Multiply( b , temp , outScratch , addDCTerm );
922 else A.Multiply( x , temp , addDCTerm ) , A.Multiply( temp , r , addDCTerm ) , A.Multiply( b , temp , addDCTerm );
924 #pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
926 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = temp[i] - _r[i] , delta_new += _r[i] * _r[i];
930 if( threads>1 ) A.Multiply( x , r , outScratch , addDCTerm );
931 else A.Multiply( x , r , addDCTerm );
933 #pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
935 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i];
941 fprintf( stderr ,
"[WARNING] Initial residual too low: %g < %f\n" , delta_new , eps );
945 for( ii=0 ; ii<iters && delta_new>eps*delta_0 ; ii++ )
949 if( threads>1 ) A.Multiply( d , temp , outScratch , addDCTerm ) , A.Multiply( temp , q , outScratch , addDCTerm );
950 else A.Multiply( d , temp , addDCTerm ) , A.Multiply( temp , q , addDCTerm );
954 if( threads>1 ) A.Multiply( d , q , outScratch , addDCTerm );
955 else A.Multiply( d , q , addDCTerm );
959 #pragma omp parallel for num_threads( threads ) reduction( + : dDotQ )
961 for(
int i=0 ; i<dim ; i++ ) dDotQ += _d[i] * _q[i];
962 T2 alpha = T2( delta_new / dDotQ );
963 double delta_old = delta_new;
966 if( (ii%50)==(50-1) )
969 #pragma omp parallel for num_threads( threads )
971 for(
int i=0 ; i<dim ; i++ ) _x[i] += _d[i] * alpha;
975 if( threads>1 ) A.Multiply( x , temp , outScratch , addDCTerm ) , A.Multiply( temp , r , outScratch , addDCTerm );
976 else A.Multiply( x , temp , addDCTerm ) , A.Multiply( temp , r , addDCTerm );
980 if( threads>1 ) A.Multiply( x , r , outScratch , addDCTerm );
981 else A.Multiply( x , r , addDCTerm );
984 #pragma omp parallel for num_threads( threads ) reduction ( + : delta_new )
986 for(
int i=0 ; i<dim ; i++ ) _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
991 #pragma omp parallel for num_threads( threads ) reduction( + : delta_new )
993 for(
int i=0 ; i<dim ; i++ ) _r[i] -= _q[i] * alpha , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
996 T2 beta = T2( delta_new / delta_old );
998 #pragma omp parallel for num_threads( threads )
1000 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] + _d[i] * beta;
// Fragment of a diagonally-preconditioned fixed-point solve (original lines
// 1013-1024): for `iters` sweeps, compute Md = M*solution and apply a
// Jacobi-style correction solution[j] += (b[j]-Md[j]) / diagonal[j].
// Presumably Jacobi relaxation -- too little context visible to confirm.
1013 solution.Resize(b.Dimensions());
1017 for(
int i=0 ; i<iters ; i++ )
1019 M.Multiply( solution , Md );
1024 for(
int j=0 ; j<int(M.rows) ; j++ ) solution[j] += (b[j]-Md[j])/diagonal[j];
1029 template<
class T2 >
1033 for(
int i=0 ; i<SparseMatrix< T >::rows ; i++ )