▶ Chapter 6 of An Introduction to Parallel Programming (《并行程序设计导论》) discusses the n-body problem and implements it with MPI, Pthreads, and OpenMP. Below is the MPI code, in two versions: the basic algorithm and the reduced algorithm (which does half the force computations of the basic algorithm, at the cost of more complex message passing).
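For reference, the math the code below implements (my notation): the force particle q receives from particle k, the total force on q, and the explicit-Euler update over a time step $\Delta t$ are

$$\mathbf{f}_{qk}=-\frac{G\,m_q m_k}{\lVert\mathbf{s}_q-\mathbf{s}_k\rVert^{3}}\,(\mathbf{s}_q-\mathbf{s}_k),\qquad \mathbf{F}_q=\sum_{k\ne q}\mathbf{f}_{qk},\qquad \mathbf{s}_q\leftarrow\mathbf{s}_q+\Delta t\,\mathbf{v}_q,\qquad \mathbf{v}_q\leftarrow\mathbf{v}_q+\frac{\Delta t}{m_q}\,\mathbf{F}_q.$$

Since $\mathbf{f}_{kq}=-\mathbf{f}_{qk}$ (Newton's third law), each unordered pair of particles only needs to be evaluated once, which is the saving the reduced algorithm exploits.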
● Basic algorithm
// mpi_nbody_basic.c, MPI basic algorithm
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <mpi.h>

#define OUTPUT                      // enable state output
#define DEBUG                       // debug mode: every function prints extra output
#define DIM 2                       // two-dimensional system
#define X 0                         // X coordinate
#define Y 1                         // Y coordinate
typedef double vect_t[DIM];         // vector data type
const double G = 6.673e-11;         // gravitational constant
int my_rank, comm_sz;               // process rank and number of processes
MPI_Datatype vect_mpi_t;            // derived datatype used for vectors
vect_t *vel = NULL;                 // global particle velocities, used by rank 0 for output

void Usage(char* prog_name)// usage message
{
    fprintf(stderr, "usage: mpiexec -n <nProcesses> %s\n", prog_name);
    fprintf(stderr, "<nParticle> <nTimestep> <sizeTimestep> <outputFrequency> <g|i>\n");
    fprintf(stderr, " 'g': generate initial conditions automatically\n");
    fprintf(stderr, " 'i': read initial conditions from stdin\n");
    exit(0);
}

void Get_args(int argc, char* argv[], int* n_p, int* n_steps_p, double* delta_t_p, int* output_freq_p, char* g_i_p)// parse the command line
{   // every process calls this function because it contains collective communication, but only rank 0 parses the arguments
    if (my_rank == 0)
    {
        if (argc != 6)
            Usage(argv[0]);
        *n_p = strtol(argv[1], NULL, 10);
        *n_steps_p = strtol(argv[2], NULL, 10);
        *delta_t_p = strtod(argv[3], NULL);
        *output_freq_p = strtol(argv[4], NULL, 10);
        *g_i_p = argv[5][0];
        if (*n_p <= 0 || *n_p % comm_sz || *n_steps_p < 0 || *delta_t_p <= 0 || (*g_i_p != 'g' && *g_i_p != 'i'))// invalid input
            Usage(argv[0]);// Usage() exits; mpiexec then shuts down the remaining processes
    }
    MPI_Bcast(n_p, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(n_steps_p, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(delta_t_p, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Bcast(output_freq_p, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(g_i_p, 1, MPI_CHAR, 0, MPI_COMM_WORLD);
# ifdef DEBUG// verify the arguments received by each process
    printf("Get_args, rank%2d n %d n_steps %d delta_t %e output_freq %d g_i %c\n",
        my_rank, *n_p, *n_steps_p, *delta_t_p, *output_freq_p, *g_i_p);
    fflush(stdout);
# endif
}

void Gen_init_cond(double masses[], vect_t pos[], vect_t loc_vel[], int n, int loc_n)// generate initial conditions; every process calls this function because of the collective communication
{   // particles sit at the origin and along the positive X axis; speeds are equal, parallel to the Y axis, alternating up and down
    const double mass = 5.0e24, gap = 1.0e5, speed = 3.0e4;// roughly the Earth's mass and orbital speed
    if (my_rank == 0)
    {
        // srand(2);// enable together with the commented-out line below for random speeds; same in the reduced version
        for (int i = 0; i < n; i++)
        {
            masses[i] = mass;
            pos[i][X] = i * gap;
            pos[i][Y] = 0.0;
            vel[i][X] = 0.0;
            // vel[i][Y] = speed * (2 * rand() / (double)RAND_MAX - 1);
            vel[i][Y] = (i % 2) ? -speed : speed;
        }
    }
    // broadcast the masses and positions, scatter the velocities
    MPI_Bcast(masses, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Bcast(pos, n, vect_mpi_t, 0, MPI_COMM_WORLD);
    MPI_Scatter(vel, loc_n, vect_mpi_t, loc_vel, loc_n, vect_mpi_t, 0, MPI_COMM_WORLD);
# ifdef DEBUG// verify the first particle's initial condition on every process
    printf("Gen_init_cond, rank%2d %10.3e %10.3e %10.3e %10.3e %10.3e\n",
        my_rank, masses[0], pos[0][X], pos[0][Y], loc_vel[0][X], loc_vel[0][Y]);
    fflush(stdout);
# endif
}

void Get_init_cond(double masses[], vect_t pos[], vect_t loc_vel[], int n, int loc_n)// read initial conditions from stdin; parallels Gen_init_cond()
{
    if (my_rank == 0)
    {
        printf("For each particle, enter (in order): mass x-coord y-coord x-velocity y-velocity\n");
        for (int i = 0; i < n; i++)
        {
            scanf_s("%lf", &masses[i]);
            scanf_s("%lf", &pos[i][X]);
            scanf_s("%lf", &pos[i][Y]);
            scanf_s("%lf", &vel[i][X]);
            scanf_s("%lf", &vel[i][Y]);
        }
    }
    MPI_Bcast(masses, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Bcast(pos, n, vect_mpi_t, 0, MPI_COMM_WORLD);
    MPI_Scatter(vel, loc_n, vect_mpi_t, loc_vel, loc_n, vect_mpi_t, 0, MPI_COMM_WORLD);
# ifdef DEBUG
    printf("Get_init_cond, rank%2d %10.3e %10.3e %10.3e %10.3e %10.3e\n",
        my_rank, masses[0], pos[0][X], pos[0][Y], loc_vel[0][X], loc_vel[0][Y]);
    fflush(stdout);
# endif
}

void Output_state(double time, double masses[], vect_t pos[], vect_t loc_vel[], int n, int loc_n)// print the current state
{
    MPI_Gather(loc_vel, loc_n, vect_mpi_t, vel, loc_n, vect_mpi_t, 0, MPI_COMM_WORLD);// gather the velocities from all processes for output
    if (my_rank == 0)
    {
        printf("Output_state, time = %.2f\n", time);
        for (int i = 0; i < n; i++)
            printf(" %2d %10.3e %10.3e %10.3e %10.3e %10.3e\n", i, masses[i], pos[i][X], pos[i][Y], vel[i][X], vel[i][Y]);
        printf("\n");
        fflush(stdout);
    }
}

void Compute_force(int loc_part, double masses[], vect_t loc_forces[], vect_t pos[], int n, int loc_n)// compute the gravitational force on one particle
{
    const int part = my_rank * loc_n + loc_part;// convert the local particle index to a global index
    int k;
    vect_t f_part_k;
    double len, fact;
    loc_forces[loc_part][X] = loc_forces[loc_part][Y] = 0.0;
    for (k = 0; k < n; k++)
    {
        if (k != part)
        {
            f_part_k[X] = pos[part][X] - pos[k][X];
            f_part_k[Y] = pos[part][Y] - pos[k][Y];
            len = sqrt(f_part_k[X] * f_part_k[X] + f_part_k[Y] * f_part_k[Y]);
            fact = -G * masses[part] * masses[k] / (len * len * len);
            f_part_k[X] *= fact;
            f_part_k[Y] *= fact;
            loc_forces[loc_part][X] += f_part_k[X];
            loc_forces[loc_part][Y] += f_part_k[Y];
# ifdef DEBUG// verify the intermediate results
            printf("Compute_force, rank%2d k%2d> %10.3e %10.3e %10.3e %10.3e\n", my_rank, k, len, fact, f_part_k[X], f_part_k[Y]);
            fflush(stdout);// referencing vel inside this printf() crashes every process other than rank 0 (vel is only allocated on rank 0); that mistake cost me dearly
# endif
        }
    }
}

void Update_part(int loc_part, double masses[], vect_t loc_forces[], vect_t loc_pos[], vect_t loc_vel[], int n, int loc_n, double delta_t)// update one particle's state
{
    const int part = my_rank * loc_n + loc_part;
    const double fact = delta_t / masses[part];
# ifdef DEBUG// print the data before and after the update
    printf("Update_part before, part%2d %10.3e %10.3e %10.3e %10.3e %10.3e %10.3e\n",
        part, loc_pos[loc_part][X], loc_pos[loc_part][Y], loc_vel[loc_part][X], loc_vel[loc_part][Y], loc_forces[loc_part][X], loc_forces[loc_part][Y]);
    fflush(stdout);
# endif
    loc_pos[loc_part][X] += delta_t * loc_vel[loc_part][X];
    loc_pos[loc_part][Y] += delta_t * loc_vel[loc_part][Y];
    loc_vel[loc_part][X] += fact * loc_forces[loc_part][X];
    loc_vel[loc_part][Y] += fact * loc_forces[loc_part][Y];
# ifdef DEBUG
    printf("Update_part after, part%2d %10.3e %10.3e %10.3e %10.3e\n",
        part, loc_pos[loc_part][X], loc_pos[loc_part][Y], loc_vel[loc_part][X], loc_vel[loc_part][Y]);
    fflush(stdout);
# endif
}

int main(int argc, char* argv[])
{
    int n, loc_n, loc_part;     // particle count, particles per process, current particle (loop variable)
    int n_steps, step;          // number of time steps, current step (loop variable)
    double delta_t;             // time-step size
    int output_freq;            // output frequency
    double *masses;             // particle masses; every process holds all of them, fixed once initialized and synchronized
    vect_t *pos, *loc_pos;      // particle positions; synchronized after every time step
    vect_t *loc_vel;            // particle velocities; kept per process, synchronized only for output
    vect_t *loc_forces;         // forces on this process's particles
    char g_i;                   // initial-condition option: 'g' = generate, 'i' = read from stdin
    double start, finish;       // timer

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Type_contiguous(DIM, MPI_DOUBLE, &vect_mpi_t);// build and commit the derived datatype
    MPI_Type_commit(&vect_mpi_t);

    // parse the arguments, initialize the arrays
    Get_args(argc, argv, &n, &n_steps, &delta_t, &output_freq, &g_i);
    loc_n = n / comm_sz;        // requires n % comm_sz == 0
    masses = (double*)malloc(n * sizeof(double));
    pos = (vect_t*)malloc(n * sizeof(vect_t));
    loc_pos = pos + my_rank * loc_n;
    loc_vel = (vect_t*)malloc(loc_n * sizeof(vect_t));
    loc_forces = (vect_t*)malloc(loc_n * sizeof(vect_t));
    if (my_rank == 0)
        vel = (vect_t*)malloc(n * sizeof(vect_t));
    if (g_i == 'g')
        Gen_init_cond(masses, pos, loc_vel, n, loc_n);
    else
        Get_init_cond(masses, pos, loc_vel, n, loc_n);

    // start computing and timing
    if (my_rank == 0)
        start = MPI_Wtime();
# ifdef OUTPUT// print the initial state
    Output_state(0.0, masses, pos, loc_vel, n, loc_n);
# endif
    for (step = 1; step <= n_steps; step++)
    {
        // compute the force on each particle, update the particle states, then synchronize the positions
        for (loc_part = 0; loc_part < loc_n; loc_part++)
            Compute_force(loc_part, masses, loc_forces, pos, n, loc_n);
        for (loc_part = 0; loc_part < loc_n; loc_part++)
            Update_part(loc_part, masses, loc_forces, loc_pos, loc_vel, n, loc_n, delta_t);
        MPI_Allgather(MPI_IN_PLACE, loc_n, vect_mpi_t, pos, loc_n, vect_mpi_t, MPI_COMM_WORLD);
# ifdef OUTPUT// print once every output interval
        if (step % output_freq == 0)
            Output_state(step * delta_t, masses, pos, loc_vel, n, loc_n);
# endif
    }
    // report the timing, release the resources
    if (my_rank == 0)
    {
        finish = MPI_Wtime();
        printf("Elapsed time = %e ms\n", (finish - start) * 1000);
        free(vel);
    }
    MPI_Type_free(&vect_mpi_t);
    free(masses);
    free(pos);
    free(loc_vel);
    free(loc_forces);
    MPI_Finalize();
    return 0;
}
● Output. With 8 processes and 16 bodies, 3 one-second time steps, debug output disabled: 1.264884e+00 ms. With 8 processes and 1024 bodies, 3600 one-second time steps, output and debug disabled: 1.328894e+04 ms.
D:\Code\MPI\MPIProjectTemp\x64\Debug>mpiexec -n 8 MPIProjectTemp.exe 16 3 1 1 g
Output_state, time = 0.00
0 5.000e+24 0.000e+00 0.000e+00 0.000e+00 3.000e+04
1 5.000e+24 1.000e+05 0.000e+00 0.000e+00 -3.000e+04
2 5.000e+24 2.000e+05 0.000e+00 0.000e+00 3.000e+04
3 5.000e+24 3.000e+05 0.000e+00 0.000e+00 -3.000e+04
4 5.000e+24 4.000e+05 0.000e+00 0.000e+00 3.000e+04
5 5.000e+24 5.000e+05 0.000e+00 0.000e+00 -3.000e+04
6 5.000e+24 6.000e+05 0.000e+00 0.000e+00 3.000e+04
7 5.000e+24 7.000e+05 0.000e+00 0.000e+00 -3.000e+04
8 5.000e+24 8.000e+05 0.000e+00 0.000e+00 3.000e+04
9 5.000e+24 9.000e+05 0.000e+00 0.000e+00 -3.000e+04
10 5.000e+24 1.000e+06 0.000e+00 0.000e+00 3.000e+04
11 5.000e+24 1.100e+06 0.000e+00 0.000e+00 -3.000e+04
12 5.000e+24 1.200e+06 0.000e+00 0.000e+00 3.000e+04
13 5.000e+24 1.300e+06 0.000e+00 0.000e+00 -3.000e+04
14 5.000e+24 1.400e+06 0.000e+00 0.000e+00 3.000e+04
15 5.000e+24 1.500e+06 0.000e+00 0.000e+00 -3.000e+04
Output_state, time = 1.00
0 5.000e+24 0.000e+00 3.000e+04 5.273e+04 3.000e+04
1 5.000e+24 1.000e+05 -3.000e+04 1.922e+04 -3.000e+04
2 5.000e+24 2.000e+05 3.000e+04 1.071e+04 3.000e+04
3 5.000e+24 3.000e+05 -3.000e+04 6.802e+03 -3.000e+04
4 5.000e+24 4.000e+05 3.000e+04 4.485e+03 3.000e+04
5 5.000e+24 5.000e+05 -3.000e+04 2.875e+03 -3.000e+04
6 5.000e+24 6.000e+05 3.000e+04 1.614e+03 3.000e+04
7 5.000e+24 7.000e+05 -3.000e+04 5.213e+02 -3.000e+04
8 5.000e+24 8.000e+05 3.000e+04 -5.213e+02 3.000e+04
9 5.000e+24 9.000e+05 -3.000e+04 -1.614e+03 -3.000e+04
10 5.000e+24 1.000e+06 3.000e+04 -2.875e+03 3.000e+04
11 5.000e+24 1.100e+06 -3.000e+04 -4.485e+03 -3.000e+04
12 5.000e+24 1.200e+06 3.000e+04 -6.802e+03 3.000e+04
13 5.000e+24 1.300e+06 -3.000e+04 -1.071e+04 -3.000e+04
14 5.000e+24 1.400e+06 3.000e+04 -1.922e+04 3.000e+04
15 5.000e+24 1.500e+06 -3.000e+04 -5.273e+04 -3.000e+04
Output_state, time = 2.00
0 5.000e+24 5.273e+04 6.000e+04 9.288e+04 1.641e+04
1 5.000e+24 1.192e+05 -6.000e+04 3.818e+04 -3.791e+03
2 5.000e+24 2.107e+05 6.000e+04 2.116e+04 3.791e+03
3 5.000e+24 3.068e+05 -6.000e+04 1.356e+04 -3.101e+03
4 5.000e+24 4.045e+05 6.000e+04 8.930e+03 3.101e+03
5 5.000e+24 5.029e+05 -6.000e+04 5.739e+03 -2.959e+03
6 5.000e+24 6.016e+05 6.000e+04 3.218e+03 2.959e+03
7 5.000e+24 7.005e+05 -6.000e+04 1.043e+03 -2.929e+03
8 5.000e+24 7.995e+05 6.000e+04 -1.043e+03 2.929e+03
9 5.000e+24 8.984e+05 -6.000e+04 -3.218e+03 -2.959e+03
10 5.000e+24 9.971e+05 6.000e+04 -5.739e+03 2.959e+03
11 5.000e+24 1.096e+06 -6.000e+04 -8.930e+03 -3.101e+03
12 5.000e+24 1.193e+06 6.000e+04 -1.356e+04 3.101e+03
13 5.000e+24 1.289e+06 -6.000e+04 -2.116e+04 -3.791e+03
14 5.000e+24 1.381e+06 6.000e+04 -3.818e+04 3.791e+03
15 5.000e+24 1.447e+06 -6.000e+04 -9.288e+04 -1.641e+04
Output_state, time = 3.00
0 5.000e+24 1.456e+05 7.641e+04 1.273e+05 -1.575e+03
1 5.000e+24 1.574e+05 -6.379e+04 5.867e+04 2.528e+04
2 5.000e+24 2.319e+05 6.379e+04 2.685e+04 -2.069e+04
3 5.000e+24 3.204e+05 -6.310e+04 1.884e+04 2.228e+04
4 5.000e+24 4.134e+05 6.310e+04 1.235e+04 -2.151e+04
5 5.000e+24 5.086e+05 -6.296e+04 8.111e+03 2.179e+04
6 5.000e+24 6.048e+05 6.296e+04 4.537e+03 -2.163e+04
7 5.000e+24 7.016e+05 -6.293e+04 1.493e+03 2.169e+04
8 5.000e+24 7.984e+05 6.293e+04 -1.493e+03 -2.169e+04
9 5.000e+24 8.952e+05 -6.296e+04 -4.537e+03 2.163e+04
10 5.000e+24 9.914e+05 6.296e+04 -8.111e+03 -2.179e+04
11 5.000e+24 1.087e+06 -6.310e+04 -1.235e+04 2.151e+04
12 5.000e+24 1.180e+06 6.310e+04 -1.884e+04 -2.228e+04
13 5.000e+24 1.268e+06 -6.379e+04 -2.685e+04 2.069e+04
14 5.000e+24 1.343e+06 6.379e+04 -5.867e+04 -2.528e+04
15 5.000e+24 1.354e+06 -7.641e+04 -1.273e+05 1.575e+03
Elapsed time = 1.264884e+00 ms
● Reduced algorithm
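How the halved workload is organized (my reading of the code that follows): particles are dealt out cyclically, so local particle i on rank r is global particle i * comm_sz + r. Each process circulates a buffer tmp_data around a ring of processes, with its positions in the first half and accumulated partial forces in the second half; every unordered pair of particles is evaluated exactly once, and the reaction force is written into the traveling buffer. First_index() returns the first global index owned by rk2 that is greater than gbl1, which is what prevents double counting: with p = 4 processes, First_index(1, 1, 3, 4) = 1 + (3 - 1) + 0 = 3, so rank 1's particle 1 is paired with rank 3's particles 3, 7, 11, 15, while the pair (0, 1) is left to rank 0. Per time step this totals n(n-1)/2 pair evaluations instead of the basic algorithm's n(n-1) force evaluations: 120 instead of 240 for the 16-body run below.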
// mpi_nbody_red.c, MPI reduced algorithm; comments shared with mpi_nbody_basic.c are omitted
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <mpi.h>

#define OUTPUT
#define DEBUG
#define DIM 2
#define X 0
#define Y 1
typedef double vect_t[DIM];
const double G = 6.673e-11;
int my_rank, comm_sz;
MPI_Datatype vect_mpi_t;
MPI_Datatype cyclic_mpi_t;  // datatype for the cyclic distribution
vect_t *vel = NULL;
vect_t *pos = NULL;         // the positions are now a global variable

void Usage(char* prog_name)
{
    fprintf(stderr, "usage: mpiexec -n <nProcesses> %s\n", prog_name);
    fprintf(stderr, "<nParticle> <nTimestep> <sizeTimestep> <outputFrequency> <g|i>\n");
    fprintf(stderr, " 'g': generate initial conditions automatically\n");
    fprintf(stderr, " 'i': read initial conditions from stdin\n");
    exit(0);
}

void Get_args(int argc, char* argv[], int* n_p, int* n_steps_p, double* delta_t_p, int* output_freq_p, char* g_i_p)
{
    if (my_rank == 0)
    {
        if (argc != 6)
            Usage(argv[0]);
        *n_p = strtol(argv[1], NULL, 10);
        *n_steps_p = strtol(argv[2], NULL, 10);
        *delta_t_p = strtod(argv[3], NULL);
        *output_freq_p = strtol(argv[4], NULL, 10);
        *g_i_p = argv[5][0];
        if (*n_p <= 0 || *n_p % comm_sz || *n_steps_p < 0 || *delta_t_p <= 0 || (*g_i_p != 'g' && *g_i_p != 'i'))
            Usage(argv[0]);
    }
    MPI_Bcast(n_p, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(n_steps_p, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(delta_t_p, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Bcast(output_freq_p, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(g_i_p, 1, MPI_CHAR, 0, MPI_COMM_WORLD);
# ifdef DEBUG
    printf("Get_args, rank%2d n %d n_steps %d delta_t %e output_freq %d g_i %c\n",
        my_rank, *n_p, *n_steps_p, *delta_t_p, *output_freq_p, *g_i_p);
    fflush(stdout);
# endif
}

void Build_cyclic_mpi_type(int loc_n)// build the datatype for a cyclic distribution of loc_n elements
{
    MPI_Datatype temp_mpi_t;
    MPI_Aint lb, extent;
    MPI_Type_vector(loc_n, 1, comm_sz, vect_mpi_t, &temp_mpi_t);// loc_n blocks of one vect_mpi_t ("2 contiguous doubles") with stride comm_sz
    MPI_Type_get_extent(vect_mpi_t, &lb, &extent);
    MPI_Type_create_resized(temp_mpi_t, lb, extent, &cyclic_mpi_t);// shrink the extent to one vect_mpi_t so consecutive items interleave
    MPI_Type_commit(&cyclic_mpi_t);
    MPI_Type_free(&temp_mpi_t);// the intermediate type is no longer needed
}

void Gen_init_cond(double masses[], vect_t loc_pos[], vect_t loc_vel[], int n, int loc_n)
{
    const double mass = 5.0e24, gap = 1.0e5, speed = 3.0e4;
    if (my_rank == 0)
    {
        // srand(2);
        for (int i = 0; i < n; i++)
        {
            masses[i] = mass;
            pos[i][X] = i * gap;
            pos[i][Y] = 0.0;
            vel[i][X] = 0.0;
            // vel[i][Y] = speed * (2 * rand() / (double)RAND_MAX - 1);
            vel[i][Y] = (i % 2) ? -speed : speed;
        }
    }
    MPI_Bcast(masses, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Scatter(pos, 1, cyclic_mpi_t, loc_pos, loc_n, vect_mpi_t, 0, MPI_COMM_WORLD);// loc_pos and loc_vel must be scattered separately
    MPI_Scatter(vel, 1, cyclic_mpi_t, loc_vel, loc_n, vect_mpi_t, 0, MPI_COMM_WORLD);
# ifdef DEBUG
    printf("Gen_init_cond, rank%2d %10.3e %10.3e %10.3e %10.3e %10.3e\n",
        my_rank, masses[0], loc_pos[0][X], loc_pos[0][Y], loc_vel[0][X], loc_vel[0][Y]);
    fflush(stdout);
# endif
}

void Get_init_cond(double masses[], vect_t loc_pos[], vect_t loc_vel[], int n, int loc_n)
{
    if (my_rank == 0)
    {
        printf("For each particle, enter (in order): mass x-coord y-coord x-velocity y-velocity\n");
        for (int i = 0; i < n; i++)
        {
            scanf_s("%lf", &masses[i]);
            scanf_s("%lf", &pos[i][X]);
            scanf_s("%lf", &pos[i][Y]);
            scanf_s("%lf", &vel[i][X]);
            scanf_s("%lf", &vel[i][Y]);
        }
    }
    MPI_Bcast(masses, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Scatter(pos, 1, cyclic_mpi_t, loc_pos, loc_n, vect_mpi_t, 0, MPI_COMM_WORLD);// loc_pos and loc_vel must be scattered separately
    MPI_Scatter(vel, 1, cyclic_mpi_t, loc_vel, loc_n, vect_mpi_t, 0, MPI_COMM_WORLD);
# ifdef DEBUG
    printf("Get_init_cond, rank%2d %10.3e %10.3e %10.3e %10.3e %10.3e\n",
        my_rank, masses[0], loc_pos[0][X], loc_pos[0][Y], loc_vel[0][X], loc_vel[0][Y]);
    fflush(stdout);
# endif
}

void Output_state(double time, double masses[], vect_t loc_pos[], vect_t loc_vel[], int n, int loc_n)
{
    MPI_Gather(loc_pos, loc_n, vect_mpi_t, pos, 1, cyclic_mpi_t, 0, MPI_COMM_WORLD);// loc_pos and loc_vel must be gathered separately
    MPI_Gather(loc_vel, loc_n, vect_mpi_t, vel, 1, cyclic_mpi_t, 0, MPI_COMM_WORLD);
    if (my_rank == 0)
    {
        printf("Output_state, time = %.2f\n", time);
        for (int i = 0; i < n; i++)
            printf(" %2d %10.3e %10.3e %10.3e %10.3e %10.3e\n", i, masses[i], pos[i][X], pos[i][Y], vel[i][X], vel[i][Y]);
        printf("\n");
        fflush(stdout);
    }
}

int First_index(int gbl1, int rk1, int rk2, int proc_count)// first global index owned by rk2 that is greater than gbl1
{
    return gbl1 + (rk2 - rk1) + (rk1 < rk2 ? 0 : proc_count);
}

int Local_to_global(int loc_part, int proc_rk, int proc_count)// convert a process-local index to a global index
{
    return loc_part * proc_count + proc_rk;
}

int Global_to_local(int gbl_part, int proc_rk, int proc_count)// convert a global index to a process-local index
{
    return (gbl_part - proc_rk) / proc_count;
}

void Compute_force_pair(double m1, double m2, vect_t pos1, vect_t pos2, vect_t force1, vect_t force2)// force between a pair of particles, applied to both
{
    const double mg = -G * m1 * m2;
    vect_t f_part_k;
    double len, fact;

    f_part_k[X] = pos1[X] - pos2[X];
    f_part_k[Y] = pos1[Y] - pos2[Y];
    len = sqrt(f_part_k[X] * f_part_k[X] + f_part_k[Y] * f_part_k[Y]);
    fact = mg / (len * len * len);
    f_part_k[X] *= fact;
    f_part_k[Y] *= fact;
    force1[X] += f_part_k[X];
    force1[Y] += f_part_k[Y];
    force2[X] -= f_part_k[X];// Newton's third law: the second particle gets the reaction force
    force2[Y] -= f_part_k[Y];
}

void Compute_proc_forces(double masses[], vect_t tmp_data[], vect_t loc_forces[], vect_t pos1[], int loc_n1, int rk1, int loc_n2, int rk2, int n, int p)
{   // compute the forces between the particles in pos1 and the qualifying particles in tmp_data
    // masses     : table of particle masses
    // tmp_data   : ring-pass buffer (positions in the first half, partial forces in the second half)
    // loc_forces : forces on this process's particles
    // pos1       : positions of this process's particles
    // loc_n1     : number of particles in pos1
    // rk1        : rank owning the particles in pos1
    // loc_n2     : number of particles in tmp_data
    // rk2        : rank owning the particles contributed through tmp_data
    // n          : total particle count
    // p          : number of participating processes
    int gbl_part1, gbl_part2, loc_part1, loc_part2;
    for (gbl_part1 = rk1, loc_part1 = 0; loc_part1 < loc_n1; loc_part1++, gbl_part1 += p)
    {
        for (gbl_part2 = First_index(gbl_part1, rk1, rk2, p), loc_part2 = Global_to_local(gbl_part2, rk2, p); loc_part2 < loc_n2; loc_part2++, gbl_part2 += p)
        {
# ifdef DEBUG
            printf("Compute_proc_forces before, rank%2d> part%2d %10.3e %10.3e part%2d %10.3e %10.3e\n",
                my_rank, gbl_part1, loc_forces[loc_part1][X], loc_forces[loc_part1][Y], gbl_part2, tmp_data[loc_n2 + loc_part2][X], tmp_data[loc_n2 + loc_part2][Y]);
# endif
            Compute_force_pair(masses[gbl_part1], masses[gbl_part2], pos1[loc_part1], tmp_data[loc_part2], loc_forces[loc_part1], tmp_data[loc_n2 + loc_part2]);
# ifdef DEBUG
            printf("Compute_proc_forces after, rank%2d> part%2d %10.3e %10.3e part%2d %10.3e %10.3e\n",
                my_rank, gbl_part1, loc_forces[loc_part1][X], loc_forces[loc_part1][Y], gbl_part2, tmp_data[loc_n2 + loc_part2][X], tmp_data[loc_n2 + loc_part2][Y]);
# endif
        }
    }
}

void Compute_forces(double masses[], vect_t tmp_data[], vect_t loc_forces[], vect_t loc_pos[], int n, int loc_n)// compute the forces on this process's particles
{   // masses     : table of particle masses
    // tmp_data   : ring-pass buffer
    // loc_forces : forces on this process's particles
    // loc_pos    : positions of this process's particles
    // n          : total particle count
    // loc_n      : number of particles assigned to this process
    const int src = (my_rank + 1) % comm_sz, dest = (my_rank - 1 + comm_sz) % comm_sz;
    int i, other_proc, loc_part;

    memcpy(tmp_data, loc_pos, loc_n * sizeof(vect_t)); // put this process's positions into tmp_data
    memset(tmp_data + loc_n, 0, loc_n * sizeof(vect_t));// zero the force half of tmp_data
    memset(loc_forces, 0, loc_n * sizeof(vect_t));     // zero this process's forces

    Compute_proc_forces(masses, tmp_data, loc_forces, loc_pos, loc_n, my_rank, loc_n, my_rank, n, comm_sz);// forces among this process's own particles
    for (i = 1; i < comm_sz; i++)// forces between this process's particles and each newly received batch
    {
        other_proc = (my_rank + i) % comm_sz;// a different process's data arrives in each round
        MPI_Sendrecv_replace(tmp_data, 2 * loc_n, vect_mpi_t, dest, 0, src, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        Compute_proc_forces(masses, tmp_data, loc_forces, loc_pos, loc_n, my_rank, loc_n, other_proc, n, comm_sz);
    }
    MPI_Sendrecv_replace(tmp_data, 2 * loc_n, vect_mpi_t, dest, 0, src, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);// one more hop brings each buffer back to its owner
    for (loc_part = 0; loc_part < loc_n; loc_part++)
    {   // add the reaction forces accumulated on the other processes
        loc_forces[loc_part][X] += tmp_data[loc_n + loc_part][X];
        loc_forces[loc_part][Y] += tmp_data[loc_n + loc_part][Y];
    }
}

void Update_part(int loc_part, double masses[], vect_t loc_forces[], vect_t loc_pos[], vect_t loc_vel[], int n, int loc_n, double delta_t)
{
    const int part = Local_to_global(loc_part, my_rank, comm_sz);// cyclic mapping, not my_rank * loc_n + loc_part as in the basic (block-distributed) version
    const double fact = delta_t / masses[part];
# ifdef DEBUG// print the data before and after the update
    printf("Update_part before, part%2d %10.3e %10.3e %10.3e %10.3e %10.3e %10.3e\n",
        part, loc_pos[loc_part][X], loc_pos[loc_part][Y], loc_vel[loc_part][X], loc_vel[loc_part][Y], loc_forces[loc_part][X], loc_forces[loc_part][Y]);
    fflush(stdout);
# endif
    loc_pos[loc_part][X] += delta_t * loc_vel[loc_part][X];
    loc_pos[loc_part][Y] += delta_t * loc_vel[loc_part][Y];
    loc_vel[loc_part][X] += fact * loc_forces[loc_part][X];
    loc_vel[loc_part][Y] += fact * loc_forces[loc_part][Y];
# ifdef DEBUG
    printf("Update_part after, part%2d %10.3e %10.3e %10.3e %10.3e\n",
        part, loc_pos[loc_part][X], loc_pos[loc_part][Y], loc_vel[loc_part][X], loc_vel[loc_part][Y]);
    fflush(stdout);
# endif
}

int main(int argc, char* argv[])
{
    int n, loc_n, loc_part;
    int n_steps, step;
    double delta_t;
    int output_freq;
    double *masses;
    vect_t *loc_pos;    // *pos is now a global variable
    vect_t *tmp_data;   // buffer for the ring exchange
    vect_t *loc_vel;
    vect_t *loc_forces;
    char g_i;
    double start, finish;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Type_contiguous(DIM, MPI_DOUBLE, &vect_mpi_t);
    MPI_Type_commit(&vect_mpi_t);

    Get_args(argc, argv, &n, &n_steps, &delta_t, &output_freq, &g_i);
    loc_n = n / comm_sz;
    Build_cyclic_mpi_type(loc_n);
    masses = (double*)malloc(n * sizeof(double));
    tmp_data = (vect_t*)malloc(2 * loc_n * sizeof(vect_t)); // first half tmp_pos, second half tmp_force
    loc_pos = (vect_t*)malloc(loc_n * sizeof(vect_t));
    loc_vel = (vect_t*)malloc(loc_n * sizeof(vect_t));
    loc_forces = (vect_t*)malloc(loc_n * sizeof(vect_t));
    if (my_rank == 0)
    {
        pos = (vect_t*)malloc(n * sizeof(vect_t));
        vel = (vect_t*)malloc(n * sizeof(vect_t));
    }
    if (g_i == 'g')
        Gen_init_cond(masses, loc_pos, loc_vel, n, loc_n);
    else
        Get_init_cond(masses, loc_pos, loc_vel, n, loc_n);

    if (my_rank == 0)
        start = MPI_Wtime();
# ifdef OUTPUT
    Output_state(0.0, masses, loc_pos, loc_vel, n, loc_n);
# endif
    for (step = 1; step <= n_steps; step++)
    {
        Compute_forces(masses, tmp_data, loc_forces, loc_pos, n, loc_n);
        for (loc_part = 0; loc_part < loc_n; loc_part++)
            Update_part(loc_part, masses, loc_forces, loc_pos, loc_vel, n, loc_n, delta_t);
# ifdef OUTPUT
        if (step % output_freq == 0)
            Output_state(step * delta_t, masses, loc_pos, loc_vel, n, loc_n);
# endif
    }

    if (my_rank == 0)
    {
        finish = MPI_Wtime();
        printf("Elapsed time = %e ms\n", (finish - start) * 1000);
        free(pos);
        free(vel);
    }
    MPI_Type_free(&vect_mpi_t);
    MPI_Type_free(&cyclic_mpi_t);
    free(masses);
    free(tmp_data);
    free(loc_forces);
    free(loc_pos);
    free(loc_vel);
    MPI_Finalize();
    return 0;
}
● Output, similar to the basic algorithm's. With 8 processes and 16 bodies, 3 one-second time steps, debug output disabled: 1.476754e+00 ms; with 8 processes and 1024 bodies, 3600 one-second time steps, output and debug disabled: 1.329172e+04 ms, a slight advantage over the basic algorithm at the larger problem size.
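As an aside, the subtlest piece above is Build_cyclic_mpi_type(). Here is a minimal, self-contained sketch of the same MPI_Type_vector / MPI_Type_create_resized trick applied to plain ints (my demo, not from the book; the file name cyclic_demo.c is made up), which makes the cyclic scatter easy to test in isolation:

// cyclic_demo.c: a sketch of the cyclic scatter used by Build_cyclic_mpi_type()
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char* argv[])
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int loc_n = 3;            // elements per process
    const int n = loc_n * size;     // total elements
    MPI_Datatype temp_t, cyclic_t;

    // loc_n blocks of 1 int with stride size: selects indices 0, size, 2*size, ...
    MPI_Type_vector(loc_n, 1, size, MPI_INT, &temp_t);
    // shrink the extent to one int, so the block sent to rank r starts at index r
    MPI_Type_create_resized(temp_t, 0, sizeof(int), &cyclic_t);
    MPI_Type_commit(&cyclic_t);

    int *data = NULL;
    int loc_data[3];
    if (rank == 0)
    {
        data = (int*)malloc(n * sizeof(int));
        for (int i = 0; i < n; i++)
            data[i] = i;
    }
    // 1 cyclic_t on the send side matches loc_n MPI_INT on the receive side,
    // so rank r ends up with the global elements r, r + size, r + 2*size
    MPI_Scatter(data, 1, cyclic_t, loc_data, loc_n, MPI_INT, 0, MPI_COMM_WORLD);
    printf("rank %d got %d %d %d\n", rank, loc_data[0], loc_data[1], loc_data[2]);

    MPI_Type_free(&cyclic_t);
    MPI_Type_free(&temp_t);
    if (rank == 0)
        free(data);
    MPI_Finalize();
    return 0;
}

With 4 processes, each rank should print its residue class of the global indices, e.g. "rank 1 got 1 5 9".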