19 #define NO_INCLUDE_ROSENBROCK_CU 29 #if defined HAS_ROCM_MODEL 30 #include <hip/hip_runtime.h> 38 #endif // ifdef __cplusplus 40 #if defined MLIR_CODEGEN && (defined __CUDA__ || defined __HIP__) 43 void rbCalcKi (
float **K,
float **,
float *
X,
int* ludI,
44 void (*calcDX)(
float*,
float*,
void*),
45 void *params,
float h,
int N,
int i );
47 #if defined MLIR_CODEGEN && (defined __CUDA__ || defined __HIP__) 50 void rbSolver (
float **,
float *,
float *,
int N );
52 #if defined MLIR_CODEGEN && (defined __CUDA__ || defined __HIP__) 55 void fludcmp0 (
float **,
int n,
int *indx,
float *);
57 #if defined MLIR_CODEGEN && (defined __CUDA__ || defined __HIP__) 61 #if defined MLIR_CODEGEN && (defined __CUDA__ || defined __HIP__) 64 void flubksb0 (
float **,
int n,
int *indx,
float *);
66 #if defined MLIR_CODEGEN && (defined __CUDA__ || defined __HIP__) 69 void fludcmp (
float **,
int,
int *,
float *);
71 #if defined MLIR_CODEGEN && (defined __CUDA__ || defined __HIP__) 74 void flubksb (
float **,
int,
int *,
float * );
98 #if defined __CUDA__ || defined __HIP__ 101 void rbStepX (
float *X,
void (*calcDX)(
float*,
float*,
void*),
102 void (*calcJ)(
float**,
float*,
void*,
int ),
103 void *params,
float h,
int N )
112 const float Chi[
RS_ORDER] = { 14/3., 20/3., 4/3., 2/3. };
115 #if !defined __CUDA__ && !defined __HIP__ 116 printf(
"Increase limit for number of equations: currently %d\n",
RS_MAX_N);
117 #elif defined __CUDA__ 118 printf(
"Error: Increase limit for number of equations (RS_MAX_N macro)\n");
123 memset(kbuf, 0,
sizeof(kbuf));
124 memset(abuf, 0,
sizeof(abuf));
126 for(
int i = 0; i <
RS_ORDER || i < N; i++ ) {
127 if( i <
RS_ORDER ) k[i] = &(kbuf[i*N]);
128 if( i < N ) a[i] = &(abuf[i*N]);
130 calcJ(a, X, params, N);
133 for(
int i = 0; i < N; i++ )
134 for(
int j = 0; j < N; j++ )
135 a[i][j] = ((i == j) ? 4 : 0) - h * a[i][j];
141 for(
int ki = 0; ki <
RS_ORDER; ki++)
142 rbCalcKi ( k, a, X, ludI, calcDX, params, h, N, ki );
144 for(
int Ki = 0; Ki <
RS_ORDER; Ki++)
145 for(
int row = 0; row < N; row++ )
146 X[row] += h * Chi[Ki] * k[Ki][row];
167 #if defined MLIR_CODEGEN && (defined __CUDA__ || defined __HIP__) 170 void rbCalcKi (
float **K,
float **A,
float *X,
int* ludI,
171 void (*calcDX)(
float*,
float*,
void*),
172 void *params,
float h,
int N,
int i )
187 memcpy( xx, X, N *
sizeof(
float));
189 for(
int j = 0; j < i; j++ ) {
190 for(
int row = 0; row < N; row++ ) {
191 xx[row] += h * Alpha[i][j] * K[j][row];
192 K[i][row] += Beta[i][j] * K[j][row];
197 calcDX( DXi, xx, params );
199 for(
int row = 0; row < N; row++ ) K[i][row] += DXi[row];
219 #if defined MLIR_CODEGEN && (defined __CUDA__ || defined __HIP__) 222 void rbSolver(
float **A,
float *x,
float *b,
int N ) {
226 for(
int i = 1; i < N; i++ ) {
227 for(
int j = 0; j < i; j++ ) {
229 float K = -A[i][j] / A[j][j];
230 for(
int k = j; k < N; k++ ) A[i][k] += K * A[j][k];
236 for(
int i = N - 1; i >= 0; i--) {
238 for(
int j = i + 1; j < N; j++) x[i] -= A[i][j] * x[j];
255 #if defined MLIR_CODEGEN && (defined __CUDA__ || defined __HIP__) 258 void fludcmp0(
float **a,
int n,
int *indx,
float *d)
282 #if defined MLIR_CODEGEN && (defined __CUDA__ || defined __HIP__) 285 void flubksb0(
float **a,
int n,
int *indx,
float *b)
293 flubksb( ptr, n, indx-1, b-1 );
296 #define TINY 1.0e-20; 298 #if defined MLIR_CODEGEN && (defined __CUDA__ || defined __HIP__) 301 void fludcmp(
float **a,
int n,
int *indx,
float *d)
315 if ((temp=fabs(a[i][j])) > big) big=temp;
318 printf(
"Singular matrix in routine LUDCMP");
324 }
while ( ++i <= n );
335 sum -= a[i][k]*a[k][j];
348 sum -= a[i][k]*a[k][j];
352 if ( (dum=vv[i]*fabs(sum)) >= big) {
365 if (a[j][j] == 0.0) a[j][j]=
TINY;
368 for (i=j+1;i<=n;i++) a[i][j] *= dum;
375 #if defined MLIR_CODEGEN && (defined __CUDA__ || defined __HIP__) 378 void flubksb(
float **a,
int n,
int *indx,
float *b )
389 for (j=ii;j<=i-1;j++)
405 #endif // ifdef __cplusplus 407 #if defined MLIR_CODEGEN && !defined __CUDA__ && !defined __HIP__ 425 template<
size_t vector_size>
426 void rbCalcKi(
float **K,
float **A,
float *X,
int* ludI,
427 void (*calcDX)(
float*,
float*,
void*),
428 void *params,
float h,
int N,
int i)
443 memcpy(xx, X, N*vector_size*
sizeof(
float));
445 for (
int v = 0; v < vector_size; ++v) {
446 for(
int j = 0; j < i; j++ ) {
447 for(
int row = 0; row < N; row++ ) {
448 xx[v*N + row] += h * Alpha[i][j] * K[v*
RS_ORDER + j][row];
455 calcDX( DXi, xx, params );
456 for (
int v = 0; v < vector_size; ++v)
457 for(
int row = 0; row < N; row++ )
458 K[v*
RS_ORDER + i][row] += DXi[v*N + row];
460 for (
int v = 0; v < vector_size; ++v)
485 template<
size_t vector_size>
486 void rbStepX (
float *X,
void (*calcDX)(
float*,
float*,
void*),
487 void (*calcJ)(
float*,
float*,
void*,
int ),
488 void *params,
float h,
int N )
495 const float Chi[
RS_ORDER] = { 14/3., 20/3., 4/3., 2/3. };
498 printf(
"Increase limit for number of equations: currently %d\n",
RS_MAX_N );
502 memset(kbuf, 0,
sizeof(kbuf));
503 memset(abuf, 0,
sizeof(abuf));
506 for (
int i = 0; i < vector_size; ++i) {
507 for(
int j = 0; j <
RS_ORDER || j < N; j++ ) {
508 if( j <
RS_ORDER ) k[i*
RS_ORDER + j] = &(kbuf[i*K_ORDER*K_ORDER + j*K_ORDER]);
509 if( j < N ) a[i*N + j] = &(abuf[i*N*N + j*N]);
512 calcJ(abuf, X, params, N);
514 for (
int i = 0; i < vector_size; ++i)
515 for(
int j = 0; j < N; j++ )
516 for(
int k = 0; k < N; k++ )
517 a[i*N + j][k] = ((j == k) ? 4 : 0) - h * a[i*N + j][k];
520 for (
int i = 0; i < vector_size; ++i) {
522 fludcmp0( &a[i*N], N, ludI + i * N, &ludD );
525 for(
int ki = 0; ki <
RS_ORDER; ki++)
526 rbCalcKi<vector_size>( k, a, X, ludI, calcDX, params, h, N, ki);
528 for (
int i = 0; i < vector_size; ++i)
529 for(
int Ki = 0; Ki <
RS_ORDER; Ki++)
530 for(
int row = 0; row < N; row++ )
531 X[i*N + row] += h * Chi[Ki] * k[i * RS_ORDER + Ki][row];
537 #endif // ifdef __cplusplus 539 void rbStepX_2 (
float *X,
void (*calcDX)(
float*,
float*,
void*),
540 void (*calcJ)(
float*,
float*,
void*,
int ),
541 void *params,
float h,
int N )
543 rbStepX<2>(
X, calcDX, calcJ, params, h, N);
546 void rbStepX_4 (
float *X,
void (*calcDX)(
float*,
float*,
void*),
547 void (*calcJ)(
float*,
float*,
void*,
int ),
548 void *params,
float h,
int N )
550 rbStepX<4>(
X, calcDX, calcJ, params, h, N);
553 void rbStepX_8 (
float *X,
void (*calcDX)(
float*,
float*,
void*),
554 void (*calcJ)(
float*,
float*,
void*,
int ),
555 void *params,
float h,
int N )
557 rbStepX<8>(
X, calcDX, calcJ, params, h, N);
560 void rbStepX_16 (
float *X,
void (*calcDX)(
float*,
float*,
void*),
561 void (*calcJ)(
float*,
float*,
void*,
int ),
562 void *params,
float h,
int N )
564 rbStepX<16>(
X, calcDX, calcJ, params, h, N);
567 void rbStepX_32 (
float *X,
void (*calcDX)(
float*,
float*,
void*),
568 void (*calcJ)(
float*,
float*,
void*,
int ),
569 void *params,
float h,
int N )
571 rbStepX<32>(
X, calcDX, calcJ, params, h, N);
576 #endif // ifdef __cplusplus
void flubksb(float **, int, int *, float *)
void fludcmp(float **, int, int *, float *)
void flubksb0(float **, int n, int *indx, float *)
void rbStepX(float *X, void(*calcDX)(float *, float *, void *), void(*calcJ)(float **, float *, void *, int), void *params, float h, int N)
T sum(const vector< T > &vec)
Compute sum of a vector's entries.
void rbSolver(float **, float *, float *, int N)
void rbCalcKi(float **K, float **, float *X, int *ludI, void(*calcDX)(float *, float *, void *), void *params, float h, int N, int i)
void fludcmp0(float **, int n, int *indx, float *)