1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 #if OMP_PROFILING_SUPPORT
36 #include "llvm/Support/TimeProfiler.h"
37 static char *ProfileTraceFile = nullptr;
38 #endif
39 
40 /* these are temporary issues to be dealt with */
41 #define KMP_USE_PRCTL 0
42 
43 #if KMP_OS_WINDOWS
44 #include <process.h>
45 #endif
46 
47 #include "tsan_annotations.h"
48 
49 #if KMP_OS_WINDOWS
50 // Windows does not need these include files as it doesn't use shared memory
51 #else
52 #include <sys/mman.h>
53 #include <sys/stat.h>
54 #include <fcntl.h>
55 #define SHM_SIZE 1024
56 #endif
57 
58 #if defined(KMP_GOMP_COMPAT)
59 char const __kmp_version_alt_comp[] =
60  KMP_VERSION_PREFIX "alternative compiler support: yes";
61 #endif /* defined(KMP_GOMP_COMPAT) */
62 
63 char const __kmp_version_omp_api[] =
64  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
65 
66 #ifdef KMP_DEBUG
67 char const __kmp_version_lock[] =
68  KMP_VERSION_PREFIX "lock type: run time selectable";
69 #endif /* KMP_DEBUG */
70 
71 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
72 
73 /* ------------------------------------------------------------------------ */
74 
75 #if KMP_USE_MONITOR
76 kmp_info_t __kmp_monitor;
77 #endif
78 
79 /* Forward declarations */
80 
81 void __kmp_cleanup(void);
82 
83 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
84  int gtid);
85 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
86  kmp_internal_control_t *new_icvs,
87  ident_t *loc);
88 #if KMP_AFFINITY_SUPPORTED
89 static void __kmp_partition_places(kmp_team_t *team,
90  int update_master_only = 0);
91 #endif
92 static void __kmp_do_serial_initialize(void);
93 void __kmp_fork_barrier(int gtid, int tid);
94 void __kmp_join_barrier(int gtid);
95 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
96  kmp_internal_control_t *new_icvs, ident_t *loc);
97 
98 #ifdef USE_LOAD_BALANCE
99 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
100 #endif
101 
102 static int __kmp_expand_threads(int nNeed);
103 #if KMP_OS_WINDOWS
104 static int __kmp_unregister_root_other_thread(int gtid);
105 #endif
106 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
107 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
108 
109 /* Calculate the identifier of the current thread */
110 /* fast (and somewhat portable) way to get a unique identifier for the executing
111  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
112 int __kmp_get_global_thread_id() {
113  int i;
114  kmp_info_t **other_threads;
115  size_t stack_data;
116  char *stack_addr;
117  size_t stack_size;
118  char *stack_base;
119 
120  KA_TRACE(
121  1000,
122  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
123  __kmp_nth, __kmp_all_nth));
124 
125  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
126  a parallel region, this was made to return KMP_GTID_DNE to force
127  serial_initialize by the caller. Callers therefore must handle KMP_GTID_DNE
128  at all call sites, or else guarantee __kmp_init_gtid for this to work. */
129 
130  if (!TCR_4(__kmp_init_gtid))
131  return KMP_GTID_DNE;
132 
133 #ifdef KMP_TDATA_GTID
134  if (TCR_4(__kmp_gtid_mode) >= 3) {
135  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
136  return __kmp_gtid;
137  }
138 #endif
139  if (TCR_4(__kmp_gtid_mode) >= 2) {
140  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
141  return __kmp_gtid_get_specific();
142  }
143  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
144 
145  stack_addr = (char *)&stack_data;
146  other_threads = __kmp_threads;
147 
148  /* ATT: The code below is a source of potential bugs due to unsynchronized
149  access to __kmp_threads array. For example:
150  1. Current thread loads other_threads[i] to thr and checks it, it is
151  non-NULL.
152  2. Current thread is suspended by OS.
153  3. Another thread unregisters and finishes (debug versions of free()
154  may fill memory with something like 0xEF).
155  4. Current thread is resumed.
156  5. Current thread reads junk from *thr.
157  TODO: Fix it. --ln */
158 
159  for (i = 0; i < __kmp_threads_capacity; i++) {
160 
161  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
162  if (!thr)
163  continue;
164 
165  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
166  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
167 
168  /* stack grows down -- search through all of the active threads */
169 
170  if (stack_addr <= stack_base) {
171  size_t stack_diff = stack_base - stack_addr;
172 
173  if (stack_diff <= stack_size) {
174  /* The only way we can be closer than the allocated */
175  /* stack size is if we are running on this thread. */
176  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
177  return i;
178  }
179  }
180  }
181 
182  /* use keyed TLS (get_specific) to try to determine our gtid */
183  KA_TRACE(1000,
184  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
185  "thread, using TLS\n"));
186  i = __kmp_gtid_get_specific();
187 
188  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
189 
190  /* if we haven't been assigned a gtid, then return the error code */
191  if (i < 0)
192  return i;
193 
194  /* dynamically updated stack window for uber threads to avoid get_specific
195  call */
196  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
197  KMP_FATAL(StackOverflow, i);
198  }
199 
200  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
201  if (stack_addr > stack_base) {
202  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
203  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
205  stack_base);
206  } else {
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
208  stack_base - stack_addr);
209  }
210 
211  /* Reprint stack bounds for ubermaster since they have been refined */
212  if (__kmp_storage_map) {
213  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
214  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
215  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
216  other_threads[i]->th.th_info.ds.ds_stacksize,
217  "th_%d stack (refinement)", i);
218  }
219  return i;
220 }
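/* The fallback path above identifies the calling thread by stack containment:
   stacks grow downward, so the address of a local variable that lies within
   [ds_stackbase - ds_stacksize, ds_stackbase] of registered thread i can only
   belong to thread i.  Illustrative sketch of that test (not part of the
   runtime; names are hypothetical):

     static bool addr_in_stack(const char *addr, const char *base, size_t size) {
       // base is the high end of the stack allocation
       return addr <= base && (size_t)(base - addr) <= size;
     }
*/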
221 
222 int __kmp_get_global_thread_id_reg() {
223  int gtid;
224 
225  if (!__kmp_init_serial) {
226  gtid = KMP_GTID_DNE;
227  } else
228 #ifdef KMP_TDATA_GTID
229  if (TCR_4(__kmp_gtid_mode) >= 3) {
230  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
231  gtid = __kmp_gtid;
232  } else
233 #endif
234  if (TCR_4(__kmp_gtid_mode) >= 2) {
235  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
236  gtid = __kmp_gtid_get_specific();
237  } else {
238  KA_TRACE(1000,
239  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
240  gtid = __kmp_get_global_thread_id();
241  }
242 
243  /* we must be a new uber master sibling thread */
244  if (gtid == KMP_GTID_DNE) {
245  KA_TRACE(10,
246  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
247  "Registering a new gtid.\n"));
248  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
249  if (!__kmp_init_serial) {
250  __kmp_do_serial_initialize();
251  gtid = __kmp_gtid_get_specific();
252  } else {
253  gtid = __kmp_register_root(FALSE);
254  }
255  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
256  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
257  }
258 
259  KMP_DEBUG_ASSERT(gtid >= 0);
260 
261  return gtid;
262 }
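/* Summary of the gtid lookup strategy used by the two functions above:
   with __kmp_gtid_mode >= 3 the gtid is read directly from thread data
   (__kmp_gtid, under KMP_TDATA_GTID), with mode >= 2 it comes from keyed TLS
   (__kmp_gtid_get_specific), and otherwise the stack-based search in
   __kmp_get_global_thread_id is used.  The _reg variant additionally
   registers a brand-new root thread (or runs serial initialization) when no
   gtid has been assigned yet. */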
263 
264 /* caller must hold forkjoin_lock */
265 void __kmp_check_stack_overlap(kmp_info_t *th) {
266  int f;
267  char *stack_beg = NULL;
268  char *stack_end = NULL;
269  int gtid;
270 
271  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
272  if (__kmp_storage_map) {
273  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
274  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
275 
276  gtid = __kmp_gtid_from_thread(th);
277 
278  if (gtid == KMP_GTID_MONITOR) {
279  __kmp_print_storage_map_gtid(
280  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281  "th_%s stack (%s)", "mon",
282  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283  } else {
284  __kmp_print_storage_map_gtid(
285  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286  "th_%d stack (%s)", gtid,
287  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288  }
289  }
290 
291  /* No point in checking ubermaster threads since they use refinement and
292  * cannot overlap */
293  gtid = __kmp_gtid_from_thread(th);
294  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
295  KA_TRACE(10,
296  ("__kmp_check_stack_overlap: performing extensive checking\n"));
297  if (stack_beg == NULL) {
298  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
299  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
300  }
301 
302  for (f = 0; f < __kmp_threads_capacity; f++) {
303  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
304 
305  if (f_th && f_th != th) {
306  char *other_stack_end =
307  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
308  char *other_stack_beg =
309  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
310  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
311  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
312 
313  /* Print the other stack values before the abort */
314  if (__kmp_storage_map)
315  __kmp_print_storage_map_gtid(
316  -1, other_stack_beg, other_stack_end,
317  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
318  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
319 
320  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
321  __kmp_msg_null);
322  }
323  }
324  }
325  }
326  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
327 }
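/* The check above treats each thread's stack as the interval
   (stackbase - stacksize, stackbase) and reports a fatal StackOverlap error
   when either endpoint of this thread's interval falls strictly inside
   another registered thread's interval.  Illustrative sketch of the overlap
   test (not part of the runtime; names are hypothetical):

     static bool stacks_overlap(const char *beg, const char *end,
                                const char *other_beg, const char *other_end) {
       return (beg > other_beg && beg < other_end) ||
              (end > other_beg && end < other_end);
     }
*/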
328 
329 /* ------------------------------------------------------------------------ */
330 
331 void __kmp_infinite_loop(void) {
332  static int done = FALSE;
333 
334  while (!done) {
335  KMP_YIELD(TRUE);
336  }
337 }
338 
339 #define MAX_MESSAGE 512
340 
341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
342  char const *format, ...) {
343  char buffer[MAX_MESSAGE];
344  va_list ap;
345 
346  va_start(ap, format);
347  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
348  p2, (unsigned long)size, format);
349  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
350  __kmp_vprintf(kmp_err, buffer, ap);
351 #if KMP_PRINT_DATA_PLACEMENT
352  int node;
353  if (gtid >= 0) {
354  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
355  if (__kmp_storage_map_verbose) {
356  node = __kmp_get_host_node(p1);
357  if (node < 0) /* doesn't work, so don't try this next time */
358  __kmp_storage_map_verbose = FALSE;
359  else {
360  char *last;
361  int lastNode;
362  int localProc = __kmp_get_cpu_from_gtid(gtid);
363 
364  const int page_size = KMP_GET_PAGE_SIZE();
365 
366  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
367  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
368  if (localProc >= 0)
369  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
370  localProc >> 1);
371  else
372  __kmp_printf_no_lock(" GTID %d\n", gtid);
373 #if KMP_USE_PRCTL
374  /* The more elaborate format is disabled for now because of the prctl
375  * hanging bug. */
376  do {
377  last = p1;
378  lastNode = node;
379  /* This loop collates adjacent pages with the same host node. */
380  do {
381  p1 = (char *)p1 + page_size;
382  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
383  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
384  lastNode);
385  } while (p1 <= p2);
386 #else
387  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
388  (char *)p1 + (page_size - 1),
389  __kmp_get_host_node(p1));
390  if (p1 < p2) {
391  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
392  (char *)p2 + (page_size - 1),
393  __kmp_get_host_node(p2));
394  }
395 #endif
396  }
397  }
398  } else
399  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
400  }
401 #endif /* KMP_PRINT_DATA_PLACEMENT */
402  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
403 }
404 
405 void __kmp_warn(char const *format, ...) {
406  char buffer[MAX_MESSAGE];
407  va_list ap;
408 
409  if (__kmp_generate_warnings == kmp_warnings_off) {
410  return;
411  }
412 
413  va_start(ap, format);
414 
415  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
416  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
417  __kmp_vprintf(kmp_err, buffer, ap);
418  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
419 
420  va_end(ap);
421 }
422 
423 void __kmp_abort_process() {
424  // Later threads may stall here, but that's ok because abort() will kill them.
425  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
426 
427  if (__kmp_debug_buf) {
428  __kmp_dump_debug_buffer();
429  }
430 
431  if (KMP_OS_WINDOWS) {
432  // Let other threads know of abnormal termination and prevent deadlock
433  // if abort happened during library initialization or shutdown
434  __kmp_global.g.g_abort = SIGABRT;
435 
436  /* On Windows* OS, abort() by default causes a pop-up error box, which stalls
437  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
438  boxes. _set_abort_behavior() works well, but this function is not
439  available in VS7 (this is not a problem for the DLL, but it is a problem for
440  the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does not
441  help, at least in some versions of the MS C RTL.
442 
443  It seems the following sequence is the only way to simulate abort() and
444  avoid the pop-up error box. */
445  raise(SIGABRT);
446  _exit(3); // Just in case, if signal ignored, exit anyway.
447  } else {
448  __kmp_unregister_library();
449  abort();
450  }
451 
452  __kmp_infinite_loop();
453  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
454 
455 } // __kmp_abort_process
456 
457 void __kmp_abort_thread(void) {
458  // TODO: Eliminate g_abort global variable and this function.
459  // In case of abort just call abort(), it will kill all the threads.
460  __kmp_infinite_loop();
461 } // __kmp_abort_thread
462 
463 /* Print out the storage map for the major kmp_info_t thread data structures
464  that are allocated together. */
465 
466 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
467  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
468  gtid);
469 
470  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
471  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
472 
473  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
474  sizeof(kmp_local_t), "th_%d.th_local", gtid);
475 
476  __kmp_print_storage_map_gtid(
477  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
478  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
479 
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
481  &thr->th.th_bar[bs_plain_barrier + 1],
482  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
483  gtid);
484 
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
486  &thr->th.th_bar[bs_forkjoin_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
488  gtid);
489 
490 #if KMP_FAST_REDUCTION_BARRIER
491  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
492  &thr->th.th_bar[bs_reduction_barrier + 1],
493  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
494  gtid);
495 #endif // KMP_FAST_REDUCTION_BARRIER
496 }
497 
498 /* Print out the storage map for the major kmp_team_t team data structures
499  that are allocated together. */
500 
501 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
502  int team_id, int num_thr) {
503  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
504  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
505  header, team_id);
506 
507  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
508  &team->t.t_bar[bs_last_barrier],
509  sizeof(kmp_balign_team_t) * bs_last_barrier,
510  "%s_%d.t_bar", header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
513  &team->t.t_bar[bs_plain_barrier + 1],
514  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
515  header, team_id);
516 
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
518  &team->t.t_bar[bs_forkjoin_barrier + 1],
519  sizeof(kmp_balign_team_t),
520  "%s_%d.t_bar[forkjoin]", header, team_id);
521 
522 #if KMP_FAST_REDUCTION_BARRIER
523  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
524  &team->t.t_bar[bs_reduction_barrier + 1],
525  sizeof(kmp_balign_team_t),
526  "%s_%d.t_bar[reduction]", header, team_id);
527 #endif // KMP_FAST_REDUCTION_BARRIER
528 
529  __kmp_print_storage_map_gtid(
530  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
531  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
532 
533  __kmp_print_storage_map_gtid(
534  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
535  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
536 
537  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
538  &team->t.t_disp_buffer[num_disp_buff],
539  sizeof(dispatch_shared_info_t) * num_disp_buff,
540  "%s_%d.t_disp_buffer", header, team_id);
541 }
542 
543 static void __kmp_init_allocator() {
544  __kmp_init_memkind();
545  __kmp_init_target_mem();
546 }
547 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
548 
549 /* ------------------------------------------------------------------------ */
550 
551 #if KMP_DYNAMIC_LIB
552 #if KMP_OS_WINDOWS
553 
554 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
555  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
556 
557  switch (fdwReason) {
558 
559  case DLL_PROCESS_ATTACH:
560  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
561 
562  return TRUE;
563 
564  case DLL_PROCESS_DETACH:
565  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
566 
567  // According to Windows* documentation for DllMain entry point:
568  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
569  // lpReserved == NULL when FreeLibrary() is called,
570  // lpReserved != NULL when the process is terminated.
571  // When FreeLibrary() is called, worker threads remain alive. So the
572  // runtime's state is consistent and executing proper shutdown is OK.
573  // When the process is terminated, worker threads have exited or been
574  // forcefully terminated by the OS and only the shutdown thread remains.
575  // This can leave the runtime in an inconsistent state.
576  // Hence, only attempt proper cleanup when FreeLibrary() is called.
577  // Otherwise, rely on OS to reclaim resources.
578  if (lpReserved == NULL)
579  __kmp_internal_end_library(__kmp_gtid_get_specific());
580 
581  return TRUE;
582 
583  case DLL_THREAD_ATTACH:
584  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
585 
586  /* if we want to register new sibling threads every time, call
587  * __kmp_get_gtid() here */
588  return TRUE;
589 
590  case DLL_THREAD_DETACH:
591  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
592 
593  __kmp_internal_end_thread(__kmp_gtid_get_specific());
594  return TRUE;
595  }
596 
597  return TRUE;
598 }
599 
600 #endif /* KMP_OS_WINDOWS */
601 #endif /* KMP_DYNAMIC_LIB */
602 
603 /* __kmp_parallel_deo -- Wait until it's our turn. */
604 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
605  int gtid = *gtid_ref;
606 #ifdef BUILD_PARALLEL_ORDERED
607  kmp_team_t *team = __kmp_team_from_gtid(gtid);
608 #endif /* BUILD_PARALLEL_ORDERED */
609 
610  if (__kmp_env_consistency_check) {
611  if (__kmp_threads[gtid]->th.th_root->r.r_active)
612 #if KMP_USE_DYNAMIC_LOCK
613  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
614 #else
615  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
616 #endif
617  }
618 #ifdef BUILD_PARALLEL_ORDERED
619  if (!team->t.t_serialized) {
620  KMP_MB();
621  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
622  NULL);
623  KMP_MB();
624  }
625 #endif /* BUILD_PARALLEL_ORDERED */
626 }
627 
628 /* __kmp_parallel_dxo -- Signal the next task. */
629 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
630  int gtid = *gtid_ref;
631 #ifdef BUILD_PARALLEL_ORDERED
632  int tid = __kmp_tid_from_gtid(gtid);
633  kmp_team_t *team = __kmp_team_from_gtid(gtid);
634 #endif /* BUILD_PARALLEL_ORDERED */
635 
636  if (__kmp_env_consistency_check) {
637  if (__kmp_threads[gtid]->th.th_root->r.r_active)
638  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
639  }
640 #ifdef BUILD_PARALLEL_ORDERED
641  if (!team->t.t_serialized) {
642  KMP_MB(); /* Flush all pending memory write invalidates. */
643 
644  /* use the tid of the next thread in this team */
645  /* TODO replace with general release procedure */
646  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
647 
648  KMP_MB(); /* Flush all pending memory write invalidates. */
649  }
650 #endif /* BUILD_PARALLEL_ORDERED */
651 }
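/* Taken together, __kmp_parallel_deo/__kmp_parallel_dxo implement a simple
   token-passing protocol for the ordered construct: each thread waits until
   t_ordered.dt.t_value equals its own tid, executes its ordered region, and
   then hands the token to (tid + 1) % t_nproc.  A minimal sketch of the same
   idea using <atomic> (illustrative only, not the runtime's implementation):

     #include <atomic>
     void ordered_enter(std::atomic<int> &token, int tid) {
       while (token.load(std::memory_order_acquire) != tid) { } // spin-wait
     }
     void ordered_exit(std::atomic<int> &token, int tid, int nproc) {
       token.store((tid + 1) % nproc, std::memory_order_release); // pass token
     }
*/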
652 
653 /* ------------------------------------------------------------------------ */
654 /* The BARRIER for a SINGLE process section is always explicit */
655 
656 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
657  int status;
658  kmp_info_t *th;
659  kmp_team_t *team;
660 
661  if (!TCR_4(__kmp_init_parallel))
662  __kmp_parallel_initialize();
663  __kmp_resume_if_soft_paused();
664 
665  th = __kmp_threads[gtid];
666  team = th->th.th_team;
667  status = 0;
668 
669  th->th.th_ident = id_ref;
670 
671  if (team->t.t_serialized) {
672  status = 1;
673  } else {
674  kmp_int32 old_this = th->th.th_local.this_construct;
675 
676  ++th->th.th_local.this_construct;
677  /* try to set team count to thread count--success means thread got the
678  single block */
679  /* TODO: Should this be acquire or release? */
680  if (team->t.t_construct == old_this) {
681  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
682  th->th.th_local.this_construct);
683  }
684 #if USE_ITT_BUILD
685  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
686  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
687  team->t.t_active_level == 1) {
688  // Only report metadata by primary thread of active team at level 1
689  __kmp_itt_metadata_single(id_ref);
690  }
691 #endif /* USE_ITT_BUILD */
692  }
693 
694  if (__kmp_env_consistency_check) {
695  if (status && push_ws) {
696  __kmp_push_workshare(gtid, ct_psingle, id_ref);
697  } else {
698  __kmp_check_workshare(gtid, ct_psingle, id_ref);
699  }
700  }
701 #if USE_ITT_BUILD
702  if (status) {
703  __kmp_itt_single_start(gtid);
704  }
705 #endif /* USE_ITT_BUILD */
706  return status;
707 }
708 
709 void __kmp_exit_single(int gtid) {
710 #if USE_ITT_BUILD
711  __kmp_itt_single_end(gtid);
712 #endif /* USE_ITT_BUILD */
713  if (__kmp_env_consistency_check)
714  __kmp_pop_workshare(gtid, ct_psingle, NULL);
715 }
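/* The single construct above is decided by a race on team->t.t_construct:
   each arriving thread bumps its private this_construct counter, and the
   first thread to atomically advance the team counter from the old value to
   the new one wins the single region (status == 1).  A minimal sketch of
   that claim using <atomic> (illustrative only, not the runtime's code):

     #include <atomic>
     bool claim_single(std::atomic<int> &team_count, int old_val) {
       int desired = old_val + 1;
       return team_count.compare_exchange_strong(old_val, desired,
                                                 std::memory_order_acquire);
     }
*/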
716 
717 /* Determine whether we can go parallel or must use a serialized parallel region,
718  * and how many threads we can use.
719  * set_nthreads is the number of threads requested for the team.
720  * Returns 1 if we should serialize or use only one thread,
721  * otherwise the number of threads to use.
722  * The forkjoin lock is held by the caller. */
723 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
724  int master_tid, int set_nthreads,
725  int enter_teams) {
726  int capacity;
727  int new_nthreads;
728  KMP_DEBUG_ASSERT(__kmp_init_serial);
729  KMP_DEBUG_ASSERT(root && parent_team);
730  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
731 
732  // If dyn-var is set, dynamically adjust the number of desired threads,
733  // according to the method specified by dynamic_mode.
734  new_nthreads = set_nthreads;
735  if (!get__dynamic_2(parent_team, master_tid)) {
736  ;
737  }
738 #ifdef USE_LOAD_BALANCE
739  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
740  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
741  if (new_nthreads == 1) {
742  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
743  "reservation to 1 thread\n",
744  master_tid));
745  return 1;
746  }
747  if (new_nthreads < set_nthreads) {
748  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
749  "reservation to %d threads\n",
750  master_tid, new_nthreads));
751  }
752  }
753 #endif /* USE_LOAD_BALANCE */
754  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
755  new_nthreads = __kmp_avail_proc - __kmp_nth +
756  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
757  if (new_nthreads <= 1) {
758  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
759  "reservation to 1 thread\n",
760  master_tid));
761  return 1;
762  }
763  if (new_nthreads < set_nthreads) {
764  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
765  "reservation to %d threads\n",
766  master_tid, new_nthreads));
767  } else {
768  new_nthreads = set_nthreads;
769  }
770  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
771  if (set_nthreads > 2) {
772  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
773  new_nthreads = (new_nthreads % set_nthreads) + 1;
774  if (new_nthreads == 1) {
775  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
776  "reservation to 1 thread\n",
777  master_tid));
778  return 1;
779  }
780  if (new_nthreads < set_nthreads) {
781  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
782  "reservation to %d threads\n",
783  master_tid, new_nthreads));
784  }
785  }
786  } else {
787  KMP_ASSERT(0);
788  }
789 
790  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
791  if (__kmp_nth + new_nthreads -
792  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
793  __kmp_max_nth) {
794  int tl_nthreads = __kmp_max_nth - __kmp_nth +
795  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
796  if (tl_nthreads <= 0) {
797  tl_nthreads = 1;
798  }
799 
800  // If dyn-var is false, emit a 1-time warning.
801  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
802  __kmp_reserve_warn = 1;
803  __kmp_msg(kmp_ms_warning,
804  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
805  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
806  }
807  if (tl_nthreads == 1) {
808  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
809  "reduced reservation to 1 thread\n",
810  master_tid));
811  return 1;
812  }
813  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
814  "reservation to %d threads\n",
815  master_tid, tl_nthreads));
816  new_nthreads = tl_nthreads;
817  }
818 
819  // Respect OMP_THREAD_LIMIT
820  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
821  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
822  if (cg_nthreads + new_nthreads -
823  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
824  max_cg_threads) {
825  int tl_nthreads = max_cg_threads - cg_nthreads +
826  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
827  if (tl_nthreads <= 0) {
828  tl_nthreads = 1;
829  }
830 
831  // If dyn-var is false, emit a 1-time warning.
832  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
833  __kmp_reserve_warn = 1;
834  __kmp_msg(kmp_ms_warning,
835  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
836  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
837  }
838  if (tl_nthreads == 1) {
839  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
840  "reduced reservation to 1 thread\n",
841  master_tid));
842  return 1;
843  }
844  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
845  "reservation to %d threads\n",
846  master_tid, tl_nthreads));
847  new_nthreads = tl_nthreads;
848  }
849 
850  // Check if the threads array is large enough, or needs expanding.
851  // See comment in __kmp_register_root() about the adjustment if
852  // __kmp_threads[0] == NULL.
853  capacity = __kmp_threads_capacity;
854  if (TCR_PTR(__kmp_threads[0]) == NULL) {
855  --capacity;
856  }
857  // If it is not for initializing the hidden helper team, we need to take
858  // __kmp_hidden_helper_threads_num out of the capacity because it is included
859  // in __kmp_threads_capacity.
860  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
861  capacity -= __kmp_hidden_helper_threads_num;
862  }
863  if (__kmp_nth + new_nthreads -
864  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
865  capacity) {
866  // Expand the threads array.
867  int slotsRequired = __kmp_nth + new_nthreads -
868  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
869  capacity;
870  int slotsAdded = __kmp_expand_threads(slotsRequired);
871  if (slotsAdded < slotsRequired) {
872  // The threads array was not expanded enough.
873  new_nthreads -= (slotsRequired - slotsAdded);
874  KMP_ASSERT(new_nthreads >= 1);
875 
876  // If dyn-var is false, emit a 1-time warning.
877  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
878  __kmp_reserve_warn = 1;
879  if (__kmp_tp_cached) {
880  __kmp_msg(kmp_ms_warning,
881  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
882  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
883  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
884  } else {
885  __kmp_msg(kmp_ms_warning,
886  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
888  }
889  }
890  }
891  }
892 
893 #ifdef KMP_DEBUG
894  if (new_nthreads == 1) {
895  KC_TRACE(10,
896  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
897  "dead roots and rechecking; requested %d threads\n",
898  __kmp_get_gtid(), set_nthreads));
899  } else {
900  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
901  " %d threads\n",
902  __kmp_get_gtid(), new_nthreads, set_nthreads));
903  }
904 #endif // KMP_DEBUG
905  return new_nthreads;
906 }
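/* Order of the clamps applied above when sizing a new team:
   1. dyn-var adjustment (load balance, thread limit, or random), if dynamic
      adjustment is enabled for the parent team;
   2. the device-wide limit __kmp_max_nth (KMP_ALL_THREADS /
      KMP_DEVICE_THREAD_LIMIT);
   3. the contention-group limit cg_thread_limit (OMP_THREAD_LIMIT);
   4. the capacity of the __kmp_threads array, expanding it if possible.
   Each clamp credits the threads the root already contributes, e.g. step 2
   computes
     tl_nthreads = __kmp_max_nth - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
   and a result of 1 means the region will be serialized. */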
907 
908 /* Allocate threads from the thread pool and assign them to the new team. We are
909  assured that there are enough threads available, because we checked on that
910  earlier inside the forkjoin critical section. */
911 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
912  kmp_info_t *master_th, int master_gtid) {
913  int i;
914  int use_hot_team;
915 
916  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
917  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
918  KMP_MB();
919 
920  /* first, let's setup the primary thread */
921  master_th->th.th_info.ds.ds_tid = 0;
922  master_th->th.th_team = team;
923  master_th->th.th_team_nproc = team->t.t_nproc;
924  master_th->th.th_team_master = master_th;
925  master_th->th.th_team_serialized = FALSE;
926  master_th->th.th_dispatch = &team->t.t_dispatch[0];
927 
928 /* make sure we are not the optimized hot team */
929 #if KMP_NESTED_HOT_TEAMS
930  use_hot_team = 0;
931  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
932  if (hot_teams) { // hot teams array is not allocated if
933  // KMP_HOT_TEAMS_MAX_LEVEL=0
934  int level = team->t.t_active_level - 1; // index in array of hot teams
935  if (master_th->th.th_teams_microtask) { // are we inside the teams?
936  if (master_th->th.th_teams_size.nteams > 1) {
937  ++level; // level was not increased in teams construct for
938  // team_of_masters
939  }
940  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
941  master_th->th.th_teams_level == team->t.t_level) {
942  ++level; // level was not increased in teams construct for
943  // team_of_workers before the parallel
944  } // team->t.t_level will be increased inside parallel
945  }
946  if (level < __kmp_hot_teams_max_level) {
947  if (hot_teams[level].hot_team) {
948  // hot team has already been allocated for given level
949  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
950  use_hot_team = 1; // the team is ready to use
951  } else {
952  use_hot_team = 0; // AC: threads are not allocated yet
953  hot_teams[level].hot_team = team; // remember new hot team
954  hot_teams[level].hot_team_nth = team->t.t_nproc;
955  }
956  } else {
957  use_hot_team = 0;
958  }
959  }
960 #else
961  use_hot_team = team == root->r.r_hot_team;
962 #endif
963  if (!use_hot_team) {
964 
965  /* install the primary thread */
966  team->t.t_threads[0] = master_th;
967  __kmp_initialize_info(master_th, team, 0, master_gtid);
968 
969  /* now, install the worker threads */
970  for (i = 1; i < team->t.t_nproc; i++) {
971 
972  /* fork or reallocate a new thread and install it in team */
973  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
974  team->t.t_threads[i] = thr;
975  KMP_DEBUG_ASSERT(thr);
976  KMP_DEBUG_ASSERT(thr->th.th_team == team);
977  /* align team and thread arrived states */
978  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
979  "T#%d(%d:%d) join =%llu, plain=%llu\n",
980  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
981  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
982  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
983  team->t.t_bar[bs_plain_barrier].b_arrived));
984  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
985  thr->th.th_teams_level = master_th->th.th_teams_level;
986  thr->th.th_teams_size = master_th->th.th_teams_size;
987  { // Initialize threads' barrier data.
988  int b;
989  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
990  for (b = 0; b < bs_last_barrier; ++b) {
991  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
992  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
993 #if USE_DEBUGGER
994  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
995 #endif
996  }
997  }
998  }
999 
1000 #if KMP_AFFINITY_SUPPORTED
1001  __kmp_partition_places(team);
1002 #endif
1003  }
1004 
1005  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1006  for (i = 0; i < team->t.t_nproc; i++) {
1007  kmp_info_t *thr = team->t.t_threads[i];
1008  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1009  thr->th.th_prev_level != team->t.t_level) {
1010  team->t.t_display_affinity = 1;
1011  break;
1012  }
1013  }
1014  }
1015 
1016  KMP_MB();
1017 }
1018 
1019 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1020 // Propagate any changes to the floating point control registers out to the team.
1021 // We try to avoid unnecessary writes to the relevant cache line in the team
1022 // structure, so we don't make changes unless they are needed.
1023 inline static void propagateFPControl(kmp_team_t *team) {
1024  if (__kmp_inherit_fp_control) {
1025  kmp_int16 x87_fpu_control_word;
1026  kmp_uint32 mxcsr;
1027 
1028  // Get primary thread's values of FPU control flags (both X87 and vector)
1029  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1030  __kmp_store_mxcsr(&mxcsr);
1031  mxcsr &= KMP_X86_MXCSR_MASK;
1032 
1033  // There is no point looking at t_fp_control_saved here.
1034  // If it is TRUE, we still have to update the values if they are different
1035  // from those we now have. If it is FALSE we didn't save anything yet, but
1036  // our objective is the same. We have to ensure that the values in the team
1037  // are the same as those we have.
1038  // So, this code achieves what we need whether or not t_fp_control_saved is
1039  // true. By checking whether the value needs updating we avoid unnecessary
1040  // writes that would put the cache-line into a written state, causing all
1041  // threads in the team to have to read it again.
1042  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1043  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1044  // Although we don't use this value, other code in the runtime wants to know
1045  // whether it should restore them. So we must ensure it is correct.
1046  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1047  } else {
1048  // Similarly here. Don't write to this cache-line in the team structure
1049  // unless we have to.
1050  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1051  }
1052 }
1053 
1054 // Do the opposite, setting the hardware registers to the updated values from
1055 // the team.
1056 inline static void updateHWFPControl(kmp_team_t *team) {
1057  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1058  // Only reset the fp control regs if they have been changed in the team
1059  // during the parallel region that we are exiting.
1060  kmp_int16 x87_fpu_control_word;
1061  kmp_uint32 mxcsr;
1062  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1063  __kmp_store_mxcsr(&mxcsr);
1064  mxcsr &= KMP_X86_MXCSR_MASK;
1065 
1066  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1067  __kmp_clear_x87_fpu_status_word();
1068  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1069  }
1070 
1071  if (team->t.t_mxcsr != mxcsr) {
1072  __kmp_load_mxcsr(&team->t.t_mxcsr);
1073  }
1074  }
1075 }
1076 #else
1077 #define propagateFPControl(x) ((void)0)
1078 #define updateHWFPControl(x) ((void)0)
1079 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
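/* Both helpers above use a check-before-write idiom (KMP_CHECK_UPDATE) so a
   team field is stored to only when its value actually changes, keeping the
   enclosing cache line out of the modified state when nothing changed.
   Illustrative sketch of that idiom (assumed semantics, not necessarily the
   macro's actual definition):

     template <typename T> static inline void check_update(T &dst, T src) {
       if (dst != src) // avoid dirtying the cache line needlessly
         dst = src;
     }
*/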
1080 
1081 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1082  int realloc); // forward declaration
1083 
1084 /* Run a parallel region that has been serialized, so it runs only in a team of
1085  the single primary thread. */
1086 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1087  kmp_info_t *this_thr;
1088  kmp_team_t *serial_team;
1089 
1090  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1091 
1092  /* Skip all this code for autopar serialized loops since it results in
1093  unacceptable overhead */
1094  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1095  return;
1096 
1097  if (!TCR_4(__kmp_init_parallel))
1098  __kmp_parallel_initialize();
1099  __kmp_resume_if_soft_paused();
1100 
1101  this_thr = __kmp_threads[global_tid];
1102  serial_team = this_thr->th.th_serial_team;
1103 
1104  /* utilize the serialized team held by this thread */
1105  KMP_DEBUG_ASSERT(serial_team);
1106  KMP_MB();
1107 
1108  if (__kmp_tasking_mode != tskm_immediate_exec) {
1109  KMP_DEBUG_ASSERT(
1110  this_thr->th.th_task_team ==
1111  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1112  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1113  NULL);
1114  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1115  "team %p, new task_team = NULL\n",
1116  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1117  this_thr->th.th_task_team = NULL;
1118  }
1119 
1120  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1121  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1122  proc_bind = proc_bind_false;
1123  } else if (proc_bind == proc_bind_default) {
1124  // No proc_bind clause was specified, so use the current value
1125  // of proc-bind-var for this parallel region.
1126  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1127  }
1128  // Reset for next parallel region
1129  this_thr->th.th_set_proc_bind = proc_bind_default;
1130 
1131 #if OMPT_SUPPORT
1132  ompt_data_t ompt_parallel_data = ompt_data_none;
1133  ompt_data_t *implicit_task_data;
1134  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1135  if (ompt_enabled.enabled &&
1136  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1137 
1138  ompt_task_info_t *parent_task_info;
1139  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1140 
1141  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1142  if (ompt_enabled.ompt_callback_parallel_begin) {
1143  int team_size = 1;
1144 
1145  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1146  &(parent_task_info->task_data), &(parent_task_info->frame),
1147  &ompt_parallel_data, team_size,
1148  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1149  }
1150  }
1151 #endif // OMPT_SUPPORT
1152 
1153  if (this_thr->th.th_team != serial_team) {
1154  // Nested level will be an index in the nested nthreads array
1155  int level = this_thr->th.th_team->t.t_level;
1156 
1157  if (serial_team->t.t_serialized) {
1158  /* this serial team was already used
1159  TODO: increase performance by making these locks more specific */
1160  kmp_team_t *new_team;
1161 
1162  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1163 
1164  new_team =
1165  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1166 #if OMPT_SUPPORT
1167  ompt_parallel_data,
1168 #endif
1169  proc_bind, &this_thr->th.th_current_task->td_icvs,
1170  0 USE_NESTED_HOT_ARG(NULL));
1171  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1172  KMP_ASSERT(new_team);
1173 
1174  /* setup new serialized team and install it */
1175  new_team->t.t_threads[0] = this_thr;
1176  new_team->t.t_parent = this_thr->th.th_team;
1177  serial_team = new_team;
1178  this_thr->th.th_serial_team = serial_team;
1179 
1180  KF_TRACE(
1181  10,
1182  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1183  global_tid, serial_team));
1184 
1185  /* TODO the above breaks the requirement that if we run out of resources,
1186  then we can still guarantee that serialized teams are ok, since we may
1187  need to allocate a new one */
1188  } else {
1189  KF_TRACE(
1190  10,
1191  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1192  global_tid, serial_team));
1193  }
1194 
1195  /* we have to initialize this serial team */
1196  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1197  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1198  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1199  serial_team->t.t_ident = loc;
1200  serial_team->t.t_serialized = 1;
1201  serial_team->t.t_nproc = 1;
1202  serial_team->t.t_parent = this_thr->th.th_team;
1203  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1204  this_thr->th.th_team = serial_team;
1205  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1206 
1207  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1208  this_thr->th.th_current_task));
1209  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1210  this_thr->th.th_current_task->td_flags.executing = 0;
1211 
1212  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1213 
1214  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1215  implicit task for each serialized task represented by
1216  team->t.t_serialized? */
1217  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1218  &this_thr->th.th_current_task->td_parent->td_icvs);
1219 
1220  // Thread value exists in the nested nthreads array for the next nested
1221  // level
1222  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1223  this_thr->th.th_current_task->td_icvs.nproc =
1224  __kmp_nested_nth.nth[level + 1];
1225  }
1226 
1227  if (__kmp_nested_proc_bind.used &&
1228  (level + 1 < __kmp_nested_proc_bind.used)) {
1229  this_thr->th.th_current_task->td_icvs.proc_bind =
1230  __kmp_nested_proc_bind.bind_types[level + 1];
1231  }
1232 
1233 #if USE_DEBUGGER
1234  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1235 #endif
1236  this_thr->th.th_info.ds.ds_tid = 0;
1237 
1238  /* set thread cache values */
1239  this_thr->th.th_team_nproc = 1;
1240  this_thr->th.th_team_master = this_thr;
1241  this_thr->th.th_team_serialized = 1;
1242 
1243  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1244  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1245  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1246 
1247  propagateFPControl(serial_team);
1248 
1249  /* check if we need to allocate dispatch buffers stack */
1250  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1251  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1252  serial_team->t.t_dispatch->th_disp_buffer =
1253  (dispatch_private_info_t *)__kmp_allocate(
1254  sizeof(dispatch_private_info_t));
1255  }
1256  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1257 
1258  KMP_MB();
1259 
1260  } else {
1261  /* this serialized team is already being used,
1262  * that's fine, just add another nested level */
1263  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1264  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1265  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1266  ++serial_team->t.t_serialized;
1267  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1268 
1269  // Nested level will be an index in the nested nthreads array
1270  int level = this_thr->th.th_team->t.t_level;
1271  // Thread value exists in the nested nthreads array for the next nested
1272  // level
1273  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1274  this_thr->th.th_current_task->td_icvs.nproc =
1275  __kmp_nested_nth.nth[level + 1];
1276  }
1277  serial_team->t.t_level++;
1278  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1279  "of serial team %p to %d\n",
1280  global_tid, serial_team, serial_team->t.t_level));
1281 
1282  /* allocate/push dispatch buffers stack */
1283  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1284  {
1285  dispatch_private_info_t *disp_buffer =
1286  (dispatch_private_info_t *)__kmp_allocate(
1287  sizeof(dispatch_private_info_t));
1288  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1289  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1290  }
1291  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1292 
1293  KMP_MB();
1294  }
1295  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1296 
1297  // Perform the display affinity functionality for
1298  // serialized parallel regions
1299  if (__kmp_display_affinity) {
1300  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1301  this_thr->th.th_prev_num_threads != 1) {
1302  // NULL means use the affinity-format-var ICV
1303  __kmp_aux_display_affinity(global_tid, NULL);
1304  this_thr->th.th_prev_level = serial_team->t.t_level;
1305  this_thr->th.th_prev_num_threads = 1;
1306  }
1307  }
1308 
1309  if (__kmp_env_consistency_check)
1310  __kmp_push_parallel(global_tid, NULL);
1311 #if OMPT_SUPPORT
1312  serial_team->t.ompt_team_info.master_return_address = codeptr;
1313  if (ompt_enabled.enabled &&
1314  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1315  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1316  OMPT_GET_FRAME_ADDRESS(0);
1317 
1318  ompt_lw_taskteam_t lw_taskteam;
1319  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1320  &ompt_parallel_data, codeptr);
1321 
1322  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1323  // don't use lw_taskteam after linking. Its content was swapped.
1324 
1325  /* OMPT implicit task begin */
1326  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1327  if (ompt_enabled.ompt_callback_implicit_task) {
1328  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1329  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1330  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1331  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1332  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1333  __kmp_tid_from_gtid(global_tid);
1334  }
1335 
1336  /* OMPT state */
1337  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1338  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1339  OMPT_GET_FRAME_ADDRESS(0);
1340  }
1341 #endif
1342 }
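/* Summary of __kmp_serialized_parallel: the primary thread either installs
   (allocating if necessary) its cached serial team on first use, or simply
   bumps t_serialized and t_level for a further nested serialized region.  In
   both cases a dispatch_private_info_t buffer is pushed so worksharing
   constructs inside the serialized region have a place to keep their state,
   and the team's cancellation request is reset to cancel_noreq. */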
1343 
1344 /* most of the work for a fork */
1345 /* return true if we really went parallel, false if serialized */
1346 int __kmp_fork_call(ident_t *loc, int gtid,
1347  enum fork_context_e call_context, // Intel, GNU, ...
1348  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1349  kmp_va_list ap) {
1350  void **argv;
1351  int i;
1352  int master_tid;
1353  int master_this_cons;
1354  kmp_team_t *team;
1355  kmp_team_t *parent_team;
1356  kmp_info_t *master_th;
1357  kmp_root_t *root;
1358  int nthreads;
1359  int master_active;
1360  int master_set_numthreads;
1361  int level;
1362  int active_level;
1363  int teams_level;
1364 #if KMP_NESTED_HOT_TEAMS
1365  kmp_hot_team_ptr_t **p_hot_teams;
1366 #endif
1367  { // KMP_TIME_BLOCK
1368  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1369  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1370 
1371  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1372  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1373  /* Some systems prefer the stack for the root thread(s) to start with */
1374  /* some gap from the parent stack to prevent false sharing. */
1375  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1376  /* These 2 lines below are so this does not get optimized out */
1377  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1378  __kmp_stkpadding += (short)((kmp_int64)dummy);
1379  }
1380 
1381  /* initialize if needed */
1382  KMP_DEBUG_ASSERT(
1383  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1384  if (!TCR_4(__kmp_init_parallel))
1385  __kmp_parallel_initialize();
1386  __kmp_resume_if_soft_paused();
1387 
1388  /* setup current data */
1389  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1390  // shutdown
1391  parent_team = master_th->th.th_team;
1392  master_tid = master_th->th.th_info.ds.ds_tid;
1393  master_this_cons = master_th->th.th_local.this_construct;
1394  root = master_th->th.th_root;
1395  master_active = root->r.r_active;
1396  master_set_numthreads = master_th->th.th_set_nproc;
1397 
1398 #if OMPT_SUPPORT
1399  ompt_data_t ompt_parallel_data = ompt_data_none;
1400  ompt_data_t *parent_task_data;
1401  ompt_frame_t *ompt_frame;
1402  ompt_data_t *implicit_task_data;
1403  void *return_address = NULL;
1404 
1405  if (ompt_enabled.enabled) {
1406  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1407  NULL, NULL);
1408  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1409  }
1410 #endif
1411 
1412  // Nested level will be an index in the nested nthreads array
1413  level = parent_team->t.t_level;
1414  // used to launch non-serial teams even if nested is not allowed
1415  active_level = parent_team->t.t_active_level;
1416  // needed to check nesting inside the teams
1417  teams_level = master_th->th.th_teams_level;
1418 #if KMP_NESTED_HOT_TEAMS
1419  p_hot_teams = &master_th->th.th_hot_teams;
1420  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1421  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1422  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1423  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1424  // it is either actual or not needed (when active_level > 0)
1425  (*p_hot_teams)[0].hot_team_nth = 1;
1426  }
1427 #endif
1428 
1429 #if OMPT_SUPPORT
1430  if (ompt_enabled.enabled) {
1431  if (ompt_enabled.ompt_callback_parallel_begin) {
1432  int team_size = master_set_numthreads
1433  ? master_set_numthreads
1434  : get__nproc_2(parent_team, master_tid);
1435  int flags = OMPT_INVOKER(call_context) |
1436  ((microtask == (microtask_t)__kmp_teams_master)
1437  ? ompt_parallel_league
1438  : ompt_parallel_team);
1439  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1440  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1441  return_address);
1442  }
1443  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1444  }
1445 #endif
1446 
1447  master_th->th.th_ident = loc;
1448 
1449  if (master_th->th.th_teams_microtask && ap &&
1450  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1451  // AC: This is the start of a parallel region that is nested inside a teams construct.
1452  // The team is actual (hot); all workers are ready at the fork barrier.
1453  // No lock is needed to do a bit of team initialization and then release the workers.
1454  parent_team->t.t_ident = loc;
1455  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1456  parent_team->t.t_argc = argc;
1457  argv = (void **)parent_team->t.t_argv;
1458  for (i = argc - 1; i >= 0; --i)
1459  *argv++ = va_arg(kmp_va_deref(ap), void *);
1460  // Increment our nested depth level, but do not increase the serialization
1461  if (parent_team == master_th->th.th_serial_team) {
1462  // AC: we are in serialized parallel
1463  __kmpc_serialized_parallel(loc, gtid);
1464  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1465 
1466  if (call_context == fork_context_gnu) {
1467  // AC: need to decrement t_serialized for enquiry functions to work
1468  // correctly, will restore at join time
1469  parent_team->t.t_serialized--;
1470  return TRUE;
1471  }
1472 
1473 #if OMPT_SUPPORT
1474  void *dummy;
1475  void **exit_frame_p;
1476 
1477  ompt_lw_taskteam_t lw_taskteam;
1478 
1479  if (ompt_enabled.enabled) {
1480  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1481  &ompt_parallel_data, return_address);
1482  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1483 
1484  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1485  // don't use lw_taskteam after linking. Its content was swapped.
1486 
1487  /* OMPT implicit task begin */
1488  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1489  if (ompt_enabled.ompt_callback_implicit_task) {
1490  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1491  __kmp_tid_from_gtid(gtid);
1492  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1493  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1494  implicit_task_data, 1,
1495  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1496  }
1497 
1498  /* OMPT state */
1499  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1500  } else {
1501  exit_frame_p = &dummy;
1502  }
1503 #endif
1504  // AC: need to decrement t_serialized for enquiry functions to work
1505  // correctly, will restore at join time
1506  parent_team->t.t_serialized--;
1507 
1508  {
1509  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1510  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1511  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1512 #if OMPT_SUPPORT
1513  ,
1514  exit_frame_p
1515 #endif
1516  );
1517  }
1518 
1519 #if OMPT_SUPPORT
1520  if (ompt_enabled.enabled) {
1521  *exit_frame_p = NULL;
1522  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1523  if (ompt_enabled.ompt_callback_implicit_task) {
1524  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1525  ompt_scope_end, NULL, implicit_task_data, 1,
1526  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1527  }
1528  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1529  __ompt_lw_taskteam_unlink(master_th);
1530  if (ompt_enabled.ompt_callback_parallel_end) {
1531  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1532  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1533  OMPT_INVOKER(call_context) | ompt_parallel_team,
1534  return_address);
1535  }
1536  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1537  }
1538 #endif
1539  return TRUE;
1540  }
1541 
1542  parent_team->t.t_pkfn = microtask;
1543  parent_team->t.t_invoke = invoker;
1544  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1545  parent_team->t.t_active_level++;
1546  parent_team->t.t_level++;
1547  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1548 
1549 #if OMPT_SUPPORT
1550  if (ompt_enabled.enabled) {
1551  ompt_lw_taskteam_t lw_taskteam;
1552  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1553  &ompt_parallel_data, return_address);
1554  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1555  }
1556 #endif
1557 
1558  /* Change number of threads in the team if requested */
1559  if (master_set_numthreads) { // The parallel has num_threads clause
1560  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1561  // AC: we can only reduce the number of threads dynamically; we cannot increase it
1562  kmp_info_t **other_threads = parent_team->t.t_threads;
1563  parent_team->t.t_nproc = master_set_numthreads;
1564  for (i = 0; i < master_set_numthreads; ++i) {
1565  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1566  }
1567  // Keep extra threads hot in the team for possible next parallels
1568  }
1569  master_th->th.th_set_nproc = 0;
1570  }
1571 
1572 #if USE_DEBUGGER
1573  if (__kmp_debugging) { // Let debugger override number of threads.
1574  int nth = __kmp_omp_num_threads(loc);
1575  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1576  master_set_numthreads = nth;
1577  }
1578  }
1579 #endif
1580 
1581 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1582  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1583  KMP_ITT_DEBUG) &&
1584  __kmp_forkjoin_frames_mode == 3 &&
1585  parent_team->t.t_active_level == 1 // only report frames at level 1
1586  && master_th->th.th_teams_size.nteams == 1) {
1587  kmp_uint64 tmp_time = __itt_get_timestamp();
1588  master_th->th.th_frame_time = tmp_time;
1589  parent_team->t.t_region_time = tmp_time;
1590  }
1591  if (__itt_stack_caller_create_ptr) {
1592  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1593  // create new stack stitching id before entering fork barrier
1594  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1595  }
1596 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1597 
1598  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1599  "master_th=%p, gtid=%d\n",
1600  root, parent_team, master_th, gtid));
1601  __kmp_internal_fork(loc, gtid, parent_team);
1602  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1603  "master_th=%p, gtid=%d\n",
1604  root, parent_team, master_th, gtid));
1605 
1606  if (call_context == fork_context_gnu)
1607  return TRUE;
1608 
1609  /* Invoke microtask for PRIMARY thread */
1610  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1611  parent_team->t.t_id, parent_team->t.t_pkfn));
1612 
1613  if (!parent_team->t.t_invoke(gtid)) {
1614  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1615  }
1616  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1617  parent_team->t.t_id, parent_team->t.t_pkfn));
1618  KMP_MB(); /* Flush all pending memory write invalidates. */
1619 
1620  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1621 
1622  return TRUE;
1623  } // Parallel closely nested in teams construct
1624 
1625 #if KMP_DEBUG
1626  if (__kmp_tasking_mode != tskm_immediate_exec) {
1627  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1628  parent_team->t.t_task_team[master_th->th.th_task_state]);
1629  }
1630 #endif
1631 
1632  int enter_teams = 0;
1633  if (parent_team->t.t_active_level >=
1634  master_th->th.th_current_task->td_icvs.max_active_levels) {
1635  nthreads = 1;
1636  } else {
1637  enter_teams = ((ap == NULL && active_level == 0) ||
1638  (ap && teams_level > 0 && teams_level == level));
1639  nthreads =
1640  master_set_numthreads
1641  ? master_set_numthreads
1642  : get__nproc_2(
1643  parent_team,
1644  master_tid); // TODO: get nproc directly from current task
1645 
1646  // Check whether we need to take the fork/join lock (no need for a
1647  // serialized parallel outside of a teams construct). This code was moved
1648  // here from __kmp_reserve_threads() to speed up nested serialized parallels.
1649  if (nthreads > 1) {
1650  if ((get__max_active_levels(master_th) == 1 &&
1651  (root->r.r_in_parallel && !enter_teams)) ||
1652  (__kmp_library == library_serial)) {
1653  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1654  " threads\n",
1655  gtid, nthreads));
1656  nthreads = 1;
1657  }
1658  }
1659  if (nthreads > 1) {
1660  /* determine how many new threads we can use */
1661  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1662  /* AC: If we execute teams from a parallel region (on the host), then the
1663  teams should be created, but each can have only 1 thread if nesting is
1664  disabled. If teams is called from a serial region, then the teams and
1665  their threads should be created regardless of the nesting setting. */
1666  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1667  nthreads, enter_teams);
1668  if (nthreads == 1) {
1669  // Free lock for single thread execution here; for multi-thread
1670  // execution it will be freed later after team of threads created
1671  // and initialized
1672  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1673  }
1674  }
1675  }
1676  KMP_DEBUG_ASSERT(nthreads > 0);
1677 
1678  // If we temporarily changed the set number of threads then restore it now
1679  master_th->th.th_set_nproc = 0;
1680 
1681  /* create a serialized parallel region? */
1682  if (nthreads == 1) {
1683 /* josh todo: hypothetical question: what do we do for OS X*? */
1684 #if KMP_OS_LINUX && \
1685  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1686  void *args[argc];
1687 #else
1688  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1689 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1690  KMP_ARCH_AARCH64) */
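  /* Note: on the Linux targets listed above, a C99 variable-length array keeps
     the argument buffer directly on the stack frame; elsewhere KMP_ALLOCA
     provides an equivalent stack allocation. Either way the buffer only needs
     to live until the serialized microtask invocation below returns. */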
1691 
1692  KA_TRACE(20,
1693  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1694 
1695  __kmpc_serialized_parallel(loc, gtid);
1696 
1697  if (call_context == fork_context_intel) {
1698  /* TODO: this is clumsy; ideally the compiler itself would pass the args. */
1699  master_th->th.th_serial_team->t.t_ident = loc;
1700  if (!ap) {
1701  // revert change made in __kmpc_serialized_parallel()
1702  master_th->th.th_serial_team->t.t_level--;
1703  // Get args from parent team for teams construct
1704 
1705 #if OMPT_SUPPORT
1706  void *dummy;
1707  void **exit_frame_p;
1708  ompt_task_info_t *task_info;
1709 
1710  ompt_lw_taskteam_t lw_taskteam;
1711 
1712  if (ompt_enabled.enabled) {
1713  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1714  &ompt_parallel_data, return_address);
1715 
1716  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1717  // don't use lw_taskteam after linking; its contents were swapped
1718 
1719  task_info = OMPT_CUR_TASK_INFO(master_th);
1720  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1721  if (ompt_enabled.ompt_callback_implicit_task) {
1722  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1723  __kmp_tid_from_gtid(gtid);
1724  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1725  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1726  &(task_info->task_data), 1,
1727  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1728  ompt_task_implicit);
1729  }
1730 
1731  /* OMPT state */
1732  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1733  } else {
1734  exit_frame_p = &dummy;
1735  }
1736 #endif
1737 
1738  {
1739  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1740  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1741  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1742  parent_team->t.t_argv
1743 #if OMPT_SUPPORT
1744  ,
1745  exit_frame_p
1746 #endif
1747  );
1748  }
1749 
1750 #if OMPT_SUPPORT
1751  if (ompt_enabled.enabled) {
1752  *exit_frame_p = NULL;
1753  if (ompt_enabled.ompt_callback_implicit_task) {
1754  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1755  ompt_scope_end, NULL, &(task_info->task_data), 1,
1756  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1757  ompt_task_implicit);
1758  }
1759  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1760  __ompt_lw_taskteam_unlink(master_th);
1761  if (ompt_enabled.ompt_callback_parallel_end) {
1762  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1763  &ompt_parallel_data, parent_task_data,
1764  OMPT_INVOKER(call_context) | ompt_parallel_team,
1765  return_address);
1766  }
1767  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1768  }
1769 #endif
1770  } else if (microtask == (microtask_t)__kmp_teams_master) {
1771  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1772  master_th->th.th_serial_team);
1773  team = master_th->th.th_team;
1774  // team->t.t_pkfn = microtask;
1775  team->t.t_invoke = invoker;
1776  __kmp_alloc_argv_entries(argc, team, TRUE);
1777  team->t.t_argc = argc;
1778  argv = (void **)team->t.t_argv;
1779  if (ap) {
1780  for (i = argc - 1; i >= 0; --i)
1781  *argv++ = va_arg(kmp_va_deref(ap), void *);
1782  } else {
1783  for (i = 0; i < argc; ++i)
1784  // Get args from parent team for teams construct
1785  argv[i] = parent_team->t.t_argv[i];
1786  }
1787  // AC: revert change made in __kmpc_serialized_parallel()
1788  // because initial code in teams should have level=0
1789  team->t.t_level--;
1790  // AC: call special invoker for outer "parallel" of teams construct
1791  invoker(gtid);
1792 #if OMPT_SUPPORT
1793  if (ompt_enabled.enabled) {
1794  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1795  if (ompt_enabled.ompt_callback_implicit_task) {
1796  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1797  ompt_scope_end, NULL, &(task_info->task_data), 0,
1798  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1799  }
1800  if (ompt_enabled.ompt_callback_parallel_end) {
1801  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1802  &ompt_parallel_data, parent_task_data,
1803  OMPT_INVOKER(call_context) | ompt_parallel_league,
1804  return_address);
1805  }
1806  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1807  }
1808 #endif
1809  } else {
1810  argv = args;
1811  for (i = argc - 1; i >= 0; --i)
1812  *argv++ = va_arg(kmp_va_deref(ap), void *);
1813  KMP_MB();
1814 
1815 #if OMPT_SUPPORT
1816  void *dummy;
1817  void **exit_frame_p;
1818  ompt_task_info_t *task_info;
1819 
1820  ompt_lw_taskteam_t lw_taskteam;
1821 
1822  if (ompt_enabled.enabled) {
1823  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1824  &ompt_parallel_data, return_address);
1825  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1826  // don't use lw_taskteam after linking; its contents were swapped
1827  task_info = OMPT_CUR_TASK_INFO(master_th);
1828  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1829 
1830  /* OMPT implicit task begin */
1831  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1832  if (ompt_enabled.ompt_callback_implicit_task) {
1833  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1834  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1835  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1836  ompt_task_implicit);
1837  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1838  __kmp_tid_from_gtid(gtid);
1839  }
1840 
1841  /* OMPT state */
1842  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1843  } else {
1844  exit_frame_p = &dummy;
1845  }
1846 #endif
1847 
1848  {
1849  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1850  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1851  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1852 #if OMPT_SUPPORT
1853  ,
1854  exit_frame_p
1855 #endif
1856  );
1857  }
1858 
1859 #if OMPT_SUPPORT
1860  if (ompt_enabled.enabled) {
1861  *exit_frame_p = NULL;
1862  if (ompt_enabled.ompt_callback_implicit_task) {
1863  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1864  ompt_scope_end, NULL, &(task_info->task_data), 1,
1865  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1866  ompt_task_implicit);
1867  }
1868 
1869  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1870  __ompt_lw_taskteam_unlink(master_th);
1871  if (ompt_enabled.ompt_callback_parallel_end) {
1872  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1873  &ompt_parallel_data, parent_task_data,
1874  OMPT_INVOKER(call_context) | ompt_parallel_team,
1875  return_address);
1876  }
1877  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1878  }
1879 #endif
1880  }
1881  } else if (call_context == fork_context_gnu) {
1882 #if OMPT_SUPPORT
1883  ompt_lw_taskteam_t lwt;
1884  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1885  return_address);
1886 
1887  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1888  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1889 // don't use lw_taskteam after linking; its contents were swapped
1890 #endif
1891 
1892  // we were called from GNU native code
1893  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1894  return FALSE;
1895  } else {
1896  KMP_ASSERT2(call_context < fork_context_last,
1897  "__kmp_fork_call: unknown fork_context parameter");
1898  }
1899 
1900  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1901  KMP_MB();
1902  return FALSE;
1903  } // if (nthreads == 1)
1904 
1905  // GEH: only modify the executing flag when not serialized; the
1906  // serialized case is handled in __kmpc_serialized_parallel
1907  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1908  "curtask=%p, curtask_max_aclevel=%d\n",
1909  parent_team->t.t_active_level, master_th,
1910  master_th->th.th_current_task,
1911  master_th->th.th_current_task->td_icvs.max_active_levels));
1912  // TODO: GEH - cannot do this assertion because root thread not set up as
1913  // executing
1914  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1915  master_th->th.th_current_task->td_flags.executing = 0;
1916 
1917  if (!master_th->th.th_teams_microtask || level > teams_level) {
1918  /* Increment our nested depth level */
1919  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1920  }
1921 
1922  // See if we need to make a copy of the ICVs.
1923  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1924  if ((level + 1 < __kmp_nested_nth.used) &&
1925  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1926  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1927  } else {
1928  nthreads_icv = 0; // don't update
1929  }
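  // Note (illustrative): __kmp_nested_nth holds the per-level list parsed from
  // a nested OMP_NUM_THREADS setting, e.g. OMP_NUM_THREADS="8,2" requests 8
  // threads at level 0 and 2 at level 1, so nthreads_icv picks up the entry
  // for the next nesting level when one is present.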
1930 
1931  // Figure out the proc_bind_policy for the new team.
1932  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1933  kmp_proc_bind_t proc_bind_icv =
1934  proc_bind_default; // proc_bind_default means don't update
1935  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1936  proc_bind = proc_bind_false;
1937  } else {
1938  if (proc_bind == proc_bind_default) {
1939  // No proc_bind clause specified; use current proc-bind-var for this
1940  // parallel region
1941  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1942  }
1943  /* else: The proc_bind policy was specified explicitly on parallel clause.
1944  This overrides proc-bind-var for this parallel region, but does not
1945  change proc-bind-var. */
1946  // Figure the value of proc-bind-var for the child threads.
1947  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1948  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1949  master_th->th.th_current_task->td_icvs.proc_bind)) {
1950  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1951  }
1952  }
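  // Note (illustrative): a nested OMP_PROC_BIND list such as
  // OMP_PROC_BIND="spread,close" is parsed into __kmp_nested_proc_bind, so a
  // proc_bind clause (if any) selects the policy for this region while
  // bind_types[level + 1] seeds proc-bind-var for the child threads.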
1953 
1954  // Reset for next parallel region
1955  master_th->th.th_set_proc_bind = proc_bind_default;
1956 
1957  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
1958  kmp_internal_control_t new_icvs;
1959  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1960  new_icvs.next = NULL;
1961  if (nthreads_icv > 0) {
1962  new_icvs.nproc = nthreads_icv;
1963  }
1964  if (proc_bind_icv != proc_bind_default) {
1965  new_icvs.proc_bind = proc_bind_icv;
1966  }
1967 
1968  /* allocate a new parallel team */
1969  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1970  team = __kmp_allocate_team(root, nthreads, nthreads,
1971 #if OMPT_SUPPORT
1972  ompt_parallel_data,
1973 #endif
1974  proc_bind, &new_icvs,
1975  argc USE_NESTED_HOT_ARG(master_th));
1976  } else {
1977  /* allocate a new parallel team */
1978  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
1979  team = __kmp_allocate_team(root, nthreads, nthreads,
1980 #if OMPT_SUPPORT
1981  ompt_parallel_data,
1982 #endif
1983  proc_bind,
1984  &master_th->th.th_current_task->td_icvs,
1985  argc USE_NESTED_HOT_ARG(master_th));
1986  }
1987  KF_TRACE(
1988  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
1989 
1990  /* setup the new team */
1991  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
1992  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
1993  KMP_CHECK_UPDATE(team->t.t_ident, loc);
1994  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
1995  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
1996 #if OMPT_SUPPORT
1997  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
1998  return_address);
1999 #endif
2000  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2001  // TODO: parent_team->t.t_level == INT_MAX ???
2002  if (!master_th->th.th_teams_microtask || level > teams_level) {
2003  int new_level = parent_team->t.t_level + 1;
2004  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2005  new_level = parent_team->t.t_active_level + 1;
2006  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2007  } else {
2008  // AC: Do not increase parallel level at start of the teams construct
2009  int new_level = parent_team->t.t_level;
2010  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2011  new_level = parent_team->t.t_active_level;
2012  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2013  }
2014  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2015  // set primary thread's schedule as new run-time schedule
2016  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2017 
2018  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2019  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2020 
2021  // Update the floating point rounding in the team if required.
2022  propagateFPControl(team);
2023 
2024  if (__kmp_tasking_mode != tskm_immediate_exec) {
2025  // Set the primary thread's task team to the team's task team. Unless this
2026  // is a hot team, it should be NULL.
2027  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2028  parent_team->t.t_task_team[master_th->th.th_task_state]);
2029  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2030  "%p, new task_team %p / team %p\n",
2031  __kmp_gtid_from_thread(master_th),
2032  master_th->th.th_task_team, parent_team,
2033  team->t.t_task_team[master_th->th.th_task_state], team));
2034 
2035  if (active_level || master_th->th.th_task_team) {
2036  // Take a memo of primary thread's task_state
2037  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2038  if (master_th->th.th_task_state_top >=
2039  master_th->th.th_task_state_stack_sz) { // increase size
2040  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2041  kmp_uint8 *old_stack, *new_stack;
2042  kmp_uint32 i;
2043  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2044  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2045  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2046  }
2047  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2048  ++i) { // zero-init rest of stack
2049  new_stack[i] = 0;
2050  }
2051  old_stack = master_th->th.th_task_state_memo_stack;
2052  master_th->th.th_task_state_memo_stack = new_stack;
2053  master_th->th.th_task_state_stack_sz = new_size;
2054  __kmp_free(old_stack);
2055  }
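  // Note: the memo stack grows geometrically (its size doubles on overflow),
  // so pushes across deeply nested parallels cost amortized O(1) copies.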
2056  // Store primary thread's task_state on stack
2057  master_th->th
2058  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2059  master_th->th.th_task_state;
2060  master_th->th.th_task_state_top++;
2061 #if KMP_NESTED_HOT_TEAMS
2062  if (master_th->th.th_hot_teams &&
2063  active_level < __kmp_hot_teams_max_level &&
2064  team == master_th->th.th_hot_teams[active_level].hot_team) {
2065  // Restore primary thread's nested state if nested hot team
2066  master_th->th.th_task_state =
2067  master_th->th
2068  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2069  } else {
2070 #endif
2071  master_th->th.th_task_state = 0;
2072 #if KMP_NESTED_HOT_TEAMS
2073  }
2074 #endif
2075  }
2076 #if !KMP_NESTED_HOT_TEAMS
2077  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2078  (team == root->r.r_hot_team));
2079 #endif
2080  }
2081 
2082  KA_TRACE(
2083  20,
2084  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2085  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2086  team->t.t_nproc));
2087  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2088  (team->t.t_master_tid == 0 &&
2089  (team->t.t_parent == root->r.r_root_team ||
2090  team->t.t_parent->t.t_serialized)));
2091  KMP_MB();
2092 
2093  /* now, setup the arguments */
2094  argv = (void **)team->t.t_argv;
2095  if (ap) {
2096  for (i = argc - 1; i >= 0; --i) {
2097  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2098  KMP_CHECK_UPDATE(*argv, new_argv);
2099  argv++;
2100  }
2101  } else {
2102  for (i = 0; i < argc; ++i) {
2103  // Get args from parent team for teams construct
2104  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2105  }
2106  }
2107 
2108  /* now actually fork the threads */
2109  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2110  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2111  root->r.r_active = TRUE;
2112 
2113  __kmp_fork_team_threads(root, team, master_th, gtid);
2114  __kmp_setup_icv_copy(team, nthreads,
2115  &master_th->th.th_current_task->td_icvs, loc);
2116 
2117 #if OMPT_SUPPORT
2118  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2119 #endif
2120 
2121  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2122 
2123 #if USE_ITT_BUILD
2124  if (team->t.t_active_level == 1 // only report frames at level 1
2125  && !master_th->th.th_teams_microtask) { // not in teams construct
2126 #if USE_ITT_NOTIFY
2127  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2128  (__kmp_forkjoin_frames_mode == 3 ||
2129  __kmp_forkjoin_frames_mode == 1)) {
2130  kmp_uint64 tmp_time = 0;
2131  if (__itt_get_timestamp_ptr)
2132  tmp_time = __itt_get_timestamp();
2133  // Internal fork - report frame begin
2134  master_th->th.th_frame_time = tmp_time;
2135  if (__kmp_forkjoin_frames_mode == 3)
2136  team->t.t_region_time = tmp_time;
2137  } else
2138 // only one notification scheme (either "submit" or "forking/joined", not both)
2139 #endif /* USE_ITT_NOTIFY */
2140  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2141  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2142  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2143  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2144  }
2145  }
2146 #endif /* USE_ITT_BUILD */
2147 
2148  /* now go on and do the work */
2149  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2150  KMP_MB();
2151  KF_TRACE(10,
2152  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2153  root, team, master_th, gtid));
2154 
2155 #if USE_ITT_BUILD
2156  if (__itt_stack_caller_create_ptr) {
2157  // create new stack stitching id before entering fork barrier
2158  if (!enter_teams) {
2159  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2160  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2161  } else if (parent_team->t.t_serialized) {
2162  // keep stack stitching id in the serialized parent_team;
2163  // current team will be used for parallel inside the teams;
2164  // if parent_team is active, then it already keeps stack stitching id
2165  // for the league of teams
2166  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2167  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2168  }
2169  }
2170 #endif /* USE_ITT_BUILD */
2171 
2172  // AC: skip __kmp_internal_fork at teams construct, let only primary
2173  // threads execute
2174  if (ap) {
2175  __kmp_internal_fork(loc, gtid, team);
2176  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2177  "master_th=%p, gtid=%d\n",
2178  root, team, master_th, gtid));
2179  }
2180 
2181  if (call_context == fork_context_gnu) {
2182  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2183  return TRUE;
2184  }
2185 
2186  /* Invoke microtask for PRIMARY thread */
2187  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2188  team->t.t_id, team->t.t_pkfn));
2189  } // END of timer KMP_fork_call block
2190 
2191 #if KMP_STATS_ENABLED
2192  // If beginning a teams construct, then change thread state
2193  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2194  if (!ap) {
2195  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2196  }
2197 #endif
2198 
2199  if (!team->t.t_invoke(gtid)) {
2200  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2201  }
2202 
2203 #if KMP_STATS_ENABLED
2204  // If was beginning of a teams construct, then reset thread state
2205  if (!ap) {
2206  KMP_SET_THREAD_STATE(previous_state);
2207  }
2208 #endif
2209 
2210  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2211  team->t.t_id, team->t.t_pkfn));
2212  KMP_MB(); /* Flush all pending memory write invalidates. */
2213 
2214  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2215 
2216 #if OMPT_SUPPORT
2217  if (ompt_enabled.enabled) {
2218  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2219  }
2220 #endif
2221 
2222  return TRUE;
2223 }
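/* Usage sketch (illustrative, not part of the runtime): compilers targeting
   this library typically lower
       #pragma omp parallel
       { body(a, b); }
   into an outlined microtask plus a call through the __kmpc_fork_call entry
   point, e.g.
       __kmpc_fork_call(&loc, 2, (kmpc_micro)outlined_body, a, b);
   which funnels into __kmp_fork_call above. The exact outlining scheme is
   compiler-specific. */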
2224 
2225 #if OMPT_SUPPORT
2226 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2227  kmp_team_t *team) {
2228  // restore state outside the region
2229  thread->th.ompt_thread_info.state =
2230  ((team->t.t_serialized) ? ompt_state_work_serial
2231  : ompt_state_work_parallel);
2232 }
2233 
2234 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2235  kmp_team_t *team, ompt_data_t *parallel_data,
2236  int flags, void *codeptr) {
2237  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2238  if (ompt_enabled.ompt_callback_parallel_end) {
2239  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2240  parallel_data, &(task_info->task_data), flags, codeptr);
2241  }
2242 
2243  task_info->frame.enter_frame = ompt_data_none;
2244  __kmp_join_restore_state(thread, team);
2245 }
2246 #endif
2247 
2248 void __kmp_join_call(ident_t *loc, int gtid
2249 #if OMPT_SUPPORT
2250  ,
2251  enum fork_context_e fork_context
2252 #endif
2253  ,
2254  int exit_teams) {
2255  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2256  kmp_team_t *team;
2257  kmp_team_t *parent_team;
2258  kmp_info_t *master_th;
2259  kmp_root_t *root;
2260  int master_active;
2261 
2262  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2263 
2264  /* setup current data */
2265  master_th = __kmp_threads[gtid];
2266  root = master_th->th.th_root;
2267  team = master_th->th.th_team;
2268  parent_team = team->t.t_parent;
2269 
2270  master_th->th.th_ident = loc;
2271 
2272 #if OMPT_SUPPORT
2273  void *team_microtask = (void *)team->t.t_pkfn;
2274  // For the GOMP interface with a serialized parallel region, we need
2275  // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2276  // end-implicit-task and end-parallel events.
2277  if (ompt_enabled.enabled &&
2278  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2279  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2280  }
2281 #endif
2282 
2283 #if KMP_DEBUG
2284  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2285  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2286  "th_task_team = %p\n",
2287  __kmp_gtid_from_thread(master_th), team,
2288  team->t.t_task_team[master_th->th.th_task_state],
2289  master_th->th.th_task_team));
2290  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2291  team->t.t_task_team[master_th->th.th_task_state]);
2292  }
2293 #endif
2294 
2295  if (team->t.t_serialized) {
2296  if (master_th->th.th_teams_microtask) {
2297  // We are in teams construct
2298  int level = team->t.t_level;
2299  int tlevel = master_th->th.th_teams_level;
2300  if (level == tlevel) {
2301  // AC: we haven't incremented it earlier at start of teams construct,
2302  // so do it here - at the end of teams construct
2303  team->t.t_level++;
2304  } else if (level == tlevel + 1) {
2305  // AC: we are exiting parallel inside teams, need to increment
2306  // serialization in order to restore it in the next call to
2307  // __kmpc_end_serialized_parallel
2308  team->t.t_serialized++;
2309  }
2310  }
2311  __kmpc_end_serialized_parallel(loc, gtid);
2312 
2313 #if OMPT_SUPPORT
2314  if (ompt_enabled.enabled) {
2315  __kmp_join_restore_state(master_th, parent_team);
2316  }
2317 #endif
2318 
2319  return;
2320  }
2321 
2322  master_active = team->t.t_master_active;
2323 
2324  if (!exit_teams) {
2325  // AC: No barrier for internal teams at exit from teams construct.
2326  // But there is barrier for external team (league).
2327  __kmp_internal_join(loc, gtid, team);
2328 #if USE_ITT_BUILD
2329  if (__itt_stack_caller_create_ptr) {
2330  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2331  // destroy the stack stitching id after join barrier
2332  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2333  team->t.t_stack_id = NULL;
2334  }
2335 #endif
2336  } else {
2337  master_th->th.th_task_state =
2338  0; // AC: no tasking in teams (out of any parallel)
2339 #if USE_ITT_BUILD
2340  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2341  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2342  // destroy the stack stitching id on exit from the teams construct
2343  // if parent_team is active, then the id will be destroyed later on
2344  // by master of the league of teams
2345  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2346  parent_team->t.t_stack_id = NULL;
2347  }
2348 #endif
2349  }
2350 
2351  KMP_MB();
2352 
2353 #if OMPT_SUPPORT
2354  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2355  void *codeptr = team->t.ompt_team_info.master_return_address;
2356 #endif
2357 
2358 #if USE_ITT_BUILD
2359  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2360  if (team->t.t_active_level == 1 &&
2361  (!master_th->th.th_teams_microtask || /* not in teams construct */
2362  master_th->th.th_teams_size.nteams == 1)) {
2363  master_th->th.th_ident = loc;
2364  // only one notification scheme (either "submit" or "forking/joined", not
2365  // both)
2366  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2367  __kmp_forkjoin_frames_mode == 3)
2368  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2369  master_th->th.th_frame_time, 0, loc,
2370  master_th->th.th_team_nproc, 1);
2371  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2372  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2373  __kmp_itt_region_joined(gtid);
2374  } // active_level == 1
2375 #endif /* USE_ITT_BUILD */
2376 
2377  if (master_th->th.th_teams_microtask && !exit_teams &&
2378  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2379  team->t.t_level == master_th->th.th_teams_level + 1) {
2380 // AC: We need to leave the team structure intact at the end of parallel
2381 // inside the teams construct, so that at the next parallel same (hot) team
2382 // works, only adjust nesting levels
2383 #if OMPT_SUPPORT
2384  ompt_data_t ompt_parallel_data = ompt_data_none;
2385  if (ompt_enabled.enabled) {
2386  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2387  if (ompt_enabled.ompt_callback_implicit_task) {
2388  int ompt_team_size = team->t.t_nproc;
2389  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2390  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2391  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2392  }
2393  task_info->frame.exit_frame = ompt_data_none;
2394  task_info->task_data = ompt_data_none;
2395  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2396  __ompt_lw_taskteam_unlink(master_th);
2397  }
2398 #endif
2399  /* Decrement our nested depth level */
2400  team->t.t_level--;
2401  team->t.t_active_level--;
2402  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2403 
2404  // Restore number of threads in the team if needed. This code relies on
2405  // the proper adjustment of th_teams_size.nth after the fork in
2406  // __kmp_teams_master on each teams primary thread in the case that
2407  // __kmp_reserve_threads reduced it.
2408  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2409  int old_num = master_th->th.th_team_nproc;
2410  int new_num = master_th->th.th_teams_size.nth;
2411  kmp_info_t **other_threads = team->t.t_threads;
2412  team->t.t_nproc = new_num;
2413  for (int i = 0; i < old_num; ++i) {
2414  other_threads[i]->th.th_team_nproc = new_num;
2415  }
2416  // Adjust states of non-used threads of the team
2417  for (int i = old_num; i < new_num; ++i) {
2418  // Re-initialize thread's barrier data.
2419  KMP_DEBUG_ASSERT(other_threads[i]);
2420  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2421  for (int b = 0; b < bs_last_barrier; ++b) {
2422  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2423  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2424 #if USE_DEBUGGER
2425  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2426 #endif
2427  }
2428  if (__kmp_tasking_mode != tskm_immediate_exec) {
2429  // Synchronize thread's task state
2430  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2431  }
2432  }
2433  }
2434 
2435 #if OMPT_SUPPORT
2436  if (ompt_enabled.enabled) {
2437  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2438  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2439  }
2440 #endif
2441 
2442  return;
2443  }
2444 
2445  /* do cleanup and restore the parent team */
2446  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2447  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2448 
2449  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2450 
2451  /* jc: The following lock has instructions with REL and ACQ semantics,
2452  separating the parallel user code called in this parallel region
2453  from the serial user code called after this function returns. */
2454  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2455 
2456  if (!master_th->th.th_teams_microtask ||
2457  team->t.t_level > master_th->th.th_teams_level) {
2458  /* Decrement our nested depth level */
2459  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2460  }
2461  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2462 
2463 #if OMPT_SUPPORT
2464  if (ompt_enabled.enabled) {
2465  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2466  if (ompt_enabled.ompt_callback_implicit_task) {
2467  int flags = (team_microtask == (void *)__kmp_teams_master)
2468  ? ompt_task_initial
2469  : ompt_task_implicit;
2470  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2471  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2472  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2473  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2474  }
2475  task_info->frame.exit_frame = ompt_data_none;
2476  task_info->task_data = ompt_data_none;
2477  }
2478 #endif
2479 
2480  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2481  master_th, team));
2482  __kmp_pop_current_task_from_thread(master_th);
2483 
2484 #if KMP_AFFINITY_SUPPORTED
2485  // Restore master thread's partition.
2486  master_th->th.th_first_place = team->t.t_first_place;
2487  master_th->th.th_last_place = team->t.t_last_place;
2488 #endif // KMP_AFFINITY_SUPPORTED
2489  master_th->th.th_def_allocator = team->t.t_def_allocator;
2490 
2491  updateHWFPControl(team);
2492 
2493  if (root->r.r_active != master_active)
2494  root->r.r_active = master_active;
2495 
2496  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2497  master_th)); // this will free worker threads
2498 
2499  /* This race was fun to find. Make sure the following stays in the critical
2500  region; otherwise assertions may fail occasionally, since the old team may
2501  be reallocated and the hierarchy appear inconsistent. It is actually safe
2502  to run and won't cause any bugs, but it will cause those assertion failures.
2503  It's only one deref & assign, so we might as well keep it in the critical region. */
2504  master_th->th.th_team = parent_team;
2505  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2506  master_th->th.th_team_master = parent_team->t.t_threads[0];
2507  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2508 
2509  /* restore serialized team, if need be */
2510  if (parent_team->t.t_serialized &&
2511  parent_team != master_th->th.th_serial_team &&
2512  parent_team != root->r.r_root_team) {
2513  __kmp_free_team(root,
2514  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2515  master_th->th.th_serial_team = parent_team;
2516  }
2517 
2518  if (__kmp_tasking_mode != tskm_immediate_exec) {
2519  if (master_th->th.th_task_state_top >
2520  0) { // Restore task state from memo stack
2521  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2522  // Remember primary thread's state if we re-use this nested hot team
2523  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2524  master_th->th.th_task_state;
2525  --master_th->th.th_task_state_top; // pop
2526  // Now restore state at this level
2527  master_th->th.th_task_state =
2528  master_th->th
2529  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2530  }
2531  // Copy the task team from the parent team to the primary thread
2532  master_th->th.th_task_team =
2533  parent_team->t.t_task_team[master_th->th.th_task_state];
2534  KA_TRACE(20,
2535  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2536  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2537  parent_team));
2538  }
2539 
2540  // TODO: GEH - cannot do this assertion because root thread not set up as
2541  // executing
2542  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2543  master_th->th.th_current_task->td_flags.executing = 1;
2544 
2545  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2546 
2547 #if OMPT_SUPPORT
2548  int flags =
2549  OMPT_INVOKER(fork_context) |
2550  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2551  : ompt_parallel_team);
2552  if (ompt_enabled.enabled) {
2553  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2554  codeptr);
2555  }
2556 #endif
2557 
2558  KMP_MB();
2559  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2560 }
2561 
2562 /* Check whether we should push an internal control record onto the
2563  serial team stack. If so, do it. */
2564 void __kmp_save_internal_controls(kmp_info_t *thread) {
2565 
2566  if (thread->th.th_team != thread->th.th_serial_team) {
2567  return;
2568  }
2569  if (thread->th.th_team->t.t_serialized > 1) {
2570  int push = 0;
2571 
2572  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2573  push = 1;
2574  } else {
2575  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2576  thread->th.th_team->t.t_serialized) {
2577  push = 1;
2578  }
2579  }
2580  if (push) { /* push a record on the serial team's stack */
2581  kmp_internal_control_t *control =
2582  (kmp_internal_control_t *)__kmp_allocate(
2583  sizeof(kmp_internal_control_t));
2584 
2585  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2586 
2587  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2588 
2589  control->next = thread->th.th_team->t.t_control_stack_top;
2590  thread->th.th_team->t.t_control_stack_top = control;
2591  }
2592  }
2593 }
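/* Note: ICV setters below (e.g. __kmp_set_num_threads, __kmp_set_schedule)
   call this routine before modifying td_icvs, so that
   __kmpc_end_serialized_parallel can restore the enclosing level's values
   when a nested serialized region ends. */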
2594 
2595 /* Changes set_nproc */
2596 void __kmp_set_num_threads(int new_nth, int gtid) {
2597  kmp_info_t *thread;
2598  kmp_root_t *root;
2599 
2600  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2601  KMP_DEBUG_ASSERT(__kmp_init_serial);
2602 
2603  if (new_nth < 1)
2604  new_nth = 1;
2605  else if (new_nth > __kmp_max_nth)
2606  new_nth = __kmp_max_nth;
2607 
2608  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2609  thread = __kmp_threads[gtid];
2610  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2611  return; // nothing to do
2612 
2613  __kmp_save_internal_controls(thread);
2614 
2615  set__nproc(thread, new_nth);
2616 
2617  // If this omp_set_num_threads() call will cause the hot team size to be
2618  // reduced (in the absence of a num_threads clause), then reduce it now,
2619  // rather than waiting for the next parallel region.
2620  root = thread->th.th_root;
2621  if (__kmp_init_parallel && (!root->r.r_active) &&
2622  (root->r.r_hot_team->t.t_nproc > new_nth)
2623 #if KMP_NESTED_HOT_TEAMS
2624  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2625 #endif
2626  ) {
2627  kmp_team_t *hot_team = root->r.r_hot_team;
2628  int f;
2629 
2630  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2631 
2632  // Release the extra threads we don't need any more.
2633  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2634  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2635  if (__kmp_tasking_mode != tskm_immediate_exec) {
2636  // When decreasing team size, threads no longer in the team should unref
2637  // task team.
2638  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2639  }
2640  __kmp_free_thread(hot_team->t.t_threads[f]);
2641  hot_team->t.t_threads[f] = NULL;
2642  }
2643  hot_team->t.t_nproc = new_nth;
2644 #if KMP_NESTED_HOT_TEAMS
2645  if (thread->th.th_hot_teams) {
2646  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2647  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2648  }
2649 #endif
2650 
2651  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2652 
2653  // Update the t_nproc field in the threads that are still active.
2654  for (f = 0; f < new_nth; f++) {
2655  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2656  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2657  }
2658  // Special flag value marking an omp_set_num_threads() call
2659  hot_team->t.t_size_changed = -1;
2660  }
2661 }
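/* Usage sketch (illustrative): the usual entry point is the standard API call
       omp_set_num_threads(4);
   which reaches this routine, updates the calling thread's nproc ICV, and
   shrinks the hot team eagerly only under the conditions checked above
   (root inactive and the hot team larger than the new value). */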
2662 
2663 /* Changes max_active_levels */
2664 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2665  kmp_info_t *thread;
2666 
2667  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2668  "%d = (%d)\n",
2669  gtid, max_active_levels));
2670  KMP_DEBUG_ASSERT(__kmp_init_serial);
2671 
2672  // validate max_active_levels
2673  if (max_active_levels < 0) {
2674  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2675  // We ignore this call if the user has specified a negative value.
2676  // The current setting won't be changed. The last valid setting will be
2677  // used. A warning will be issued (if warnings are allowed as controlled by
2678  // the KMP_WARNINGS env var).
2679  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2680  "max_active_levels for thread %d = (%d)\n",
2681  gtid, max_active_levels));
2682  return;
2683  }
2684  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2685  // it's OK, the max_active_levels is within the valid range: [ 0;
2686  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2687  // We allow a zero value. (implementation defined behavior)
2688  } else {
2689  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2690  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2691  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2692  // Current upper limit is MAX_INT. (implementation defined behavior)
2693  // If the input exceeds the upper limit, we correct the input to be the
2694  // upper limit. (implementation defined behavior)
2695  // In practice, the flow should never get here while the limit is MAX_INT.
2696  }
2697  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2698  "max_active_levels for thread %d = (%d)\n",
2699  gtid, max_active_levels));
2700 
2701  thread = __kmp_threads[gtid];
2702 
2703  __kmp_save_internal_controls(thread);
2704 
2705  set__max_active_levels(thread, max_active_levels);
2706 }
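/* Usage sketch (illustrative): e.g.
       omp_set_max_active_levels(1);
   limits active (multi-threaded) parallelism to a single level, so nested
   parallel regions are serialized; the argument is validated/clamped above. */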
2707 
2708 /* Gets max_active_levels */
2709 int __kmp_get_max_active_levels(int gtid) {
2710  kmp_info_t *thread;
2711 
2712  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2713  KMP_DEBUG_ASSERT(__kmp_init_serial);
2714 
2715  thread = __kmp_threads[gtid];
2716  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2717  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2718  "curtask_maxaclevel=%d\n",
2719  gtid, thread->th.th_current_task,
2720  thread->th.th_current_task->td_icvs.max_active_levels));
2721  return thread->th.th_current_task->td_icvs.max_active_levels;
2722 }
2723 
2724 // nteams-var per-device ICV
2725 void __kmp_set_num_teams(int num_teams) {
2726  if (num_teams > 0)
2727  __kmp_nteams = num_teams;
2728 }
2729 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2730 // teams-thread-limit-var per-device ICV
2731 void __kmp_set_teams_thread_limit(int limit) {
2732  if (limit > 0)
2733  __kmp_teams_thread_limit = limit;
2734 }
2735 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
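// Note: these correspond to the OpenMP 5.1 nteams-var and teams-thread-limit-var
// ICVs, reachable via omp_set_num_teams/omp_get_max_teams,
// omp_set_teams_thread_limit/omp_get_teams_thread_limit, or the OMP_NUM_TEAMS
// and OMP_TEAMS_THREAD_LIMIT environment variables.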
2736 
2737 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2738 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2739 
2740 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2741 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2742  kmp_info_t *thread;
2743  kmp_sched_t orig_kind;
2744  // kmp_team_t *team;
2745 
2746  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2747  gtid, (int)kind, chunk));
2748  KMP_DEBUG_ASSERT(__kmp_init_serial);
2749 
2750  // Check if the kind parameter is valid, correct if needed.
2751  // Valid parameters should fit in one of two intervals - standard or extended:
2752  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2753  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2754  orig_kind = kind;
2755  kind = __kmp_sched_without_mods(kind);
2756 
2757  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2758  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2759  // TODO: Hint needs attention in case we change the default schedule.
2760  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2761  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2762  __kmp_msg_null);
2763  kind = kmp_sched_default;
2764  chunk = 0; // ignore chunk value in case of bad kind
2765  }
2766 
2767  thread = __kmp_threads[gtid];
2768 
2769  __kmp_save_internal_controls(thread);
2770 
2771  if (kind < kmp_sched_upper_std) {
2772  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2773  // distinguish static chunked vs. unchunked: the chunk should be invalid to
2774  // indicate an unchunked schedule (which is the default)
2775  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2776  } else {
2777  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2778  __kmp_sch_map[kind - kmp_sched_lower - 1];
2779  }
2780  } else {
2781  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2782  // kmp_sched_lower - 2 ];
2783  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2784  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2785  kmp_sched_lower - 2];
2786  }
2787  __kmp_sched_apply_mods_intkind(
2788  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2789  if (kind == kmp_sched_auto || chunk < 1) {
2790  // ignore parameter chunk for schedule auto
2791  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2792  } else {
2793  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2794  }
2795 }
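/* Usage sketch (illustrative): e.g.
       omp_set_schedule(omp_sched_dynamic, 4);
   maps (kind, chunk) through __kmp_sch_map into the calling thread's sched
   ICV, which a later schedule(runtime) loop then picks up. */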
2796 
2797 /* Gets def_sched_var ICV values */
2798 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2799  kmp_info_t *thread;
2800  enum sched_type th_type;
2801 
2802  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2803  KMP_DEBUG_ASSERT(__kmp_init_serial);
2804 
2805  thread = __kmp_threads[gtid];
2806 
2807  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2808  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2809  case kmp_sch_static:
2810  case kmp_sch_static_greedy:
2811  case kmp_sch_static_balanced:
2812  *kind = kmp_sched_static;
2813  __kmp_sched_apply_mods_stdkind(kind, th_type);
2814  *chunk = 0; // chunk was not set; indicate this with a zero value
2815  return;
2816  case kmp_sch_static_chunked:
2817  *kind = kmp_sched_static;
2818  break;
2819  case kmp_sch_dynamic_chunked:
2820  *kind = kmp_sched_dynamic;
2821  break;
2822  case kmp_sch_guided_chunked:
2823  case kmp_sch_guided_iterative_chunked:
2824  case kmp_sch_guided_analytical_chunked:
2825  *kind = kmp_sched_guided;
2826  break;
2827  case kmp_sch_auto:
2828  *kind = kmp_sched_auto;
2829  break;
2830  case kmp_sch_trapezoidal:
2831  *kind = kmp_sched_trapezoidal;
2832  break;
2833 #if KMP_STATIC_STEAL_ENABLED
2834  case kmp_sch_static_steal:
2835  *kind = kmp_sched_static_steal;
2836  break;
2837 #endif
2838  default:
2839  KMP_FATAL(UnknownSchedulingType, th_type);
2840  }
2841 
2842  __kmp_sched_apply_mods_stdkind(kind, th_type);
2843  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2844 }
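/* Usage sketch (illustrative): e.g.
       omp_sched_t kind; int chunk;
       omp_get_schedule(&kind, &chunk);
   reports the standard kind (with any monotonic/nonmonotonic modifier
   re-applied) and the chunk; chunk is reported as 0 for an unchunked static
   schedule. */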
2845 
2846 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2847 
2848  int ii, dd;
2849  kmp_team_t *team;
2850  kmp_info_t *thr;
2851 
2852  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2853  KMP_DEBUG_ASSERT(__kmp_init_serial);
2854 
2855  // validate level
2856  if (level == 0)
2857  return 0;
2858  if (level < 0)
2859  return -1;
2860  thr = __kmp_threads[gtid];
2861  team = thr->th.th_team;
2862  ii = team->t.t_level;
2863  if (level > ii)
2864  return -1;
2865 
2866  if (thr->th.th_teams_microtask) {
2867  // AC: we are in teams region where multiple nested teams have same level
2868  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2869  if (level <=
2870  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2871  KMP_DEBUG_ASSERT(ii >= tlevel);
2872  // AC: Since we need to pass through the teams league, we artificially
2873  // increase ii
2874  if (ii == tlevel) {
2875  ii += 2; // three teams have same level
2876  } else {
2877  ii++; // two teams have same level
2878  }
2879  }
2880  }
2881 
2882  if (ii == level)
2883  return __kmp_tid_from_gtid(gtid);
2884 
2885  dd = team->t.t_serialized;
2886  level++;
2887  while (ii > level) {
2888  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2889  }
2890  if ((team->t.t_serialized) && (!dd)) {
2891  team = team->t.t_parent;
2892  continue;
2893  }
2894  if (ii > level) {
2895  team = team->t.t_parent;
2896  dd = team->t.t_serialized;
2897  ii--;
2898  }
2899  }
2900 
2901  return (dd > 1) ? (0) : (team->t.t_master_tid);
2902 }
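/* Usage sketch (illustrative): from inside a nested region,
       int parent_tid = omp_get_ancestor_thread_num(omp_get_level() - 1);
   ends up here and walks the t_parent chain, skipping serialized teams that
   share the same level. */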
2903 
2904 int __kmp_get_team_size(int gtid, int level) {
2905 
2906  int ii, dd;
2907  kmp_team_t *team;
2908  kmp_info_t *thr;
2909 
2910  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2911  KMP_DEBUG_ASSERT(__kmp_init_serial);
2912 
2913  // validate level
2914  if (level == 0)
2915  return 1;
2916  if (level < 0)
2917  return -1;
2918  thr = __kmp_threads[gtid];
2919  team = thr->th.th_team;
2920  ii = team->t.t_level;
2921  if (level > ii)
2922  return -1;
2923 
2924  if (thr->th.th_teams_microtask) {
2925  // AC: we are in teams region where multiple nested teams have same level
2926  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2927  if (level <=
2928  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2929  KMP_DEBUG_ASSERT(ii >= tlevel);
2930  // AC: Since we need to pass through the teams league, we artificially
2931  // increase ii
2932  if (ii == tlevel) {
2933  ii += 2; // three teams have same level
2934  } else {
2935  ii++; // two teams have same level
2936  }
2937  }
2938  }
2939 
2940  while (ii > level) {
2941  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2942  }
2943  if (team->t.t_serialized && (!dd)) {
2944  team = team->t.t_parent;
2945  continue;
2946  }
2947  if (ii > level) {
2948  team = team->t.t_parent;
2949  ii--;
2950  }
2951  }
2952 
2953  return team->t.t_nproc;
2954 }
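/* Usage sketch (illustrative): e.g.
       int outer_size = omp_get_team_size(1);
   ends up here and returns t_nproc of the ancestor team at the requested
   level, using the same level-walking logic as __kmp_get_ancestor_thread_num. */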
2955 
2956 kmp_r_sched_t __kmp_get_schedule_global() {
2957  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
2958  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2959  // independently, so the updated schedule can be obtained here.
2960 
2961  kmp_r_sched_t r_sched;
2962 
2963  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2964  // __kmp_guided. __kmp_sched should keep original value, so that user can set
2965  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2966  // different roots (even in OMP 2.5)
2967  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2968  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2969  if (s == kmp_sch_static) {
2970  // replace STATIC with more detailed schedule (balanced or greedy)
2971  r_sched.r_sched_type = __kmp_static;
2972  } else if (s == kmp_sch_guided_chunked) {
2973  // replace GUIDED with more detailed schedule (iterative or analytical)
2974  r_sched.r_sched_type = __kmp_guided;
2975  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2976  r_sched.r_sched_type = __kmp_sched;
2977  }
2978  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2979 
2980  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2981  // __kmp_chunk may be wrong here (if it was not ever set)
2982  r_sched.chunk = KMP_DEFAULT_CHUNK;
2983  } else {
2984  r_sched.chunk = __kmp_chunk;
2985  }
2986 
2987  return r_sched;
2988 }
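// Note (illustrative): e.g. with OMP_SCHEDULE="guided,10", __kmp_sched stays
// guided while __kmp_guided selects the detailed variant (iterative or
// analytical), so r_sched comes back as that variant with chunk 10.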
2989 
2990 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2991  at least argc *t_argv entries for the requested team. */
2992 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
2993 
2994  KMP_DEBUG_ASSERT(team);
2995  if (!realloc || argc > team->t.t_max_argc) {
2996 
2997  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
2998  "current entries=%d\n",
2999  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3000  /* if heap space was previously allocated for args, free it */
3001  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3002  __kmp_free((void *)team->t.t_argv);
3003 
3004  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3005  /* use unused space in the cache line for arguments */
3006  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3007  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3008  "argv entries\n",
3009  team->t.t_id, team->t.t_max_argc));
3010  team->t.t_argv = &team->t.t_inline_argv[0];
3011  if (__kmp_storage_map) {
3012  __kmp_print_storage_map_gtid(
3013  -1, &team->t.t_inline_argv[0],
3014  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3015  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3016  team->t.t_id);
3017  }
3018  } else {
3019  /* allocate space for arguments in the heap */
3020  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3021  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3022  : 2 * argc;
3023  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3024  "argv entries\n",
3025  team->t.t_id, team->t.t_max_argc));
3026  team->t.t_argv =
3027  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3028  if (__kmp_storage_map) {
3029  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3030  &team->t.t_argv[team->t.t_max_argc],
3031  sizeof(void *) * team->t.t_max_argc,
3032  "team_%d.t_argv", team->t.t_id);
3033  }
3034  }
3035  }
3036 }
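// Note: small argument lists (argc <= KMP_INLINE_ARGV_ENTRIES) reuse spare
// space inside the team structure, while larger ones fall back to a
// page-allocated array sized to at least twice argc to limit reallocation.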
3037 
3038 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3039  int i;
3040  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3041  team->t.t_threads =
3042  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3043  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3044  sizeof(dispatch_shared_info_t) * num_disp_buff);
3045  team->t.t_dispatch =
3046  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3047  team->t.t_implicit_task_taskdata =
3048  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3049  team->t.t_max_nproc = max_nth;
3050 
3051  /* setup dispatch buffers */
3052  for (i = 0; i < num_disp_buff; ++i) {
3053  team->t.t_disp_buffer[i].buffer_index = i;
3054  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3055  }
3056 }
3057 
3058 static void __kmp_free_team_arrays(kmp_team_t *team) {
3059  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3060  int i;
3061  for (i = 0; i < team->t.t_max_nproc; ++i) {
3062  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3063  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3064  team->t.t_dispatch[i].th_disp_buffer = NULL;
3065  }
3066  }
3067 #if KMP_USE_HIER_SCHED
3068  __kmp_dispatch_free_hierarchies(team);
3069 #endif
3070  __kmp_free(team->t.t_threads);
3071  __kmp_free(team->t.t_disp_buffer);
3072  __kmp_free(team->t.t_dispatch);
3073  __kmp_free(team->t.t_implicit_task_taskdata);
3074  team->t.t_threads = NULL;
3075  team->t.t_disp_buffer = NULL;
3076  team->t.t_dispatch = NULL;
3077  team->t.t_implicit_task_taskdata = 0;
3078 }
3079 
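// Grow the team's arrays to hold max_nth threads. Only the existing thread
// pointers are preserved; the dispatch buffers, dispatch structures, and
// implicit task data are reallocated from scratch.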
3080 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3081  kmp_info_t **oldThreads = team->t.t_threads;
3082 
3083  __kmp_free(team->t.t_disp_buffer);
3084  __kmp_free(team->t.t_dispatch);
3085  __kmp_free(team->t.t_implicit_task_taskdata);
3086  __kmp_allocate_team_arrays(team, max_nth);
3087 
3088  KMP_MEMCPY(team->t.t_threads, oldThreads,
3089  team->t.t_nproc * sizeof(kmp_info_t *));
3090 
3091  __kmp_free(oldThreads);
3092 }
3093 
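// Build the set of internal control variables (ICVs) derived from the current
// global settings; these seed the root and serial teams.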
3094 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3095 
3096  kmp_r_sched_t r_sched =
3097  __kmp_get_schedule_global(); // get current state of scheduling globals
3098 
3099  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3100 
3101  kmp_internal_control_t g_icvs = {
3102  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3103  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3104  // adjustment of threads (per thread)
3105  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3106  // whether blocktime is explicitly set
3107  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3108 #if KMP_USE_MONITOR
3109  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3110 // intervals
3111 #endif
3112  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3113  // next parallel region (per thread)
3114  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3115  __kmp_cg_max_nth, // int thread_limit;
3116  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3117  // for max_active_levels
3118  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3119  // {sched,chunk} pair
3120  __kmp_nested_proc_bind.bind_types[0],
3121  __kmp_default_device,
3122  NULL // struct kmp_internal_control *next;
3123  };
3124 
3125  return g_icvs;
3126 }
3127 
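// Derive ICVs for a new thread from the given team's primary thread's current
// task, resetting the serial nesting level.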
3128 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3129 
3130  kmp_internal_control_t gx_icvs;
3131  gx_icvs.serial_nesting_level =
3132  0; // probably =team->t.t_serial like in save_inter_controls
3133  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3134  gx_icvs.next = NULL;
3135 
3136  return gx_icvs;
3137 }
3138 
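// One-time setup of a root: initialize the root state, then allocate the
// serialized root team and the hot team reused by subsequent parallel regions.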
3139 static void __kmp_initialize_root(kmp_root_t *root) {
3140  int f;
3141  kmp_team_t *root_team;
3142  kmp_team_t *hot_team;
3143  int hot_team_max_nth;
3144  kmp_r_sched_t r_sched =
3145  __kmp_get_schedule_global(); // get current state of scheduling globals
3146  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3147  KMP_DEBUG_ASSERT(root);
3148  KMP_ASSERT(!root->r.r_begin);
3149 
3150  /* setup the root state structure */
3151  __kmp_init_lock(&root->r.r_begin_lock);
3152  root->r.r_begin = FALSE;
3153  root->r.r_active = FALSE;
3154  root->r.r_in_parallel = 0;
3155  root->r.r_blocktime = __kmp_dflt_blocktime;
3156 
3157  /* setup the root team for this task */
3158  /* allocate the root team structure */
3159  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3160 
3161  root_team =
3162  __kmp_allocate_team(root,
3163  1, // new_nproc
3164  1, // max_nproc
3165 #if OMPT_SUPPORT
3166  ompt_data_none, // root parallel id
3167 #endif
3168  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3169  0 // argc
3170  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3171  );
3172 #if USE_DEBUGGER
3173  // Non-NULL value should be assigned to make the debugger display the root
3174  // team.
3175  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3176 #endif
3177 
3178  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3179 
3180  root->r.r_root_team = root_team;
3181  root_team->t.t_control_stack_top = NULL;
3182 
3183  /* initialize root team */
3184  root_team->t.t_threads[0] = NULL;
3185  root_team->t.t_nproc = 1;
3186  root_team->t.t_serialized = 1;
3187  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3188  root_team->t.t_sched.sched = r_sched.sched;
3189  KA_TRACE(
3190  20,
3191  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3192  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3193 
3194  /* setup the hot team for this task */
3195  /* allocate the hot team structure */
3196  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3197 
3198  hot_team =
3199  __kmp_allocate_team(root,
3200  1, // new_nproc
3201  __kmp_dflt_team_nth_ub * 2, // max_nproc
3202 #if OMPT_SUPPORT
3203  ompt_data_none, // root parallel id
3204 #endif
3205  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3206  0 // argc
3207  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3208  );
3209  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3210 
3211  root->r.r_hot_team = hot_team;
3212  root_team->t.t_control_stack_top = NULL;
3213 
3214  /* first-time initialization */
3215  hot_team->t.t_parent = root_team;
3216 
3217  /* initialize hot team */
3218  hot_team_max_nth = hot_team->t.t_max_nproc;
3219  for (f = 0; f < hot_team_max_nth; ++f) {
3220  hot_team->t.t_threads[f] = NULL;
3221  }
3222  hot_team->t.t_nproc = 1;
3223  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3224  hot_team->t.t_sched.sched = r_sched.sched;
3225  hot_team->t.t_size_changed = 0;
3226 }
3227 
3228 #ifdef KMP_DEBUG
3229 
3230 typedef struct kmp_team_list_item {
3231  kmp_team_p const *entry;
3232  struct kmp_team_list_item *next;
3233 } kmp_team_list_item_t;
3234 typedef kmp_team_list_item_t *kmp_team_list_t;
3235 
3236 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3237  kmp_team_list_t list, // List of teams.
3238  kmp_team_p const *team // Team to add.
3239 ) {
3240 
3241  // List must terminate with item where both entry and next are NULL.
3242  // Team is added to the list only once.
3243  // List is sorted in ascending order by team id.
3244  // Team id is *not* a key.
3245 
3246  kmp_team_list_t l;
3247 
3248  KMP_DEBUG_ASSERT(list != NULL);
3249  if (team == NULL) {
3250  return;
3251  }
3252 
3253  __kmp_print_structure_team_accum(list, team->t.t_parent);
3254  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3255 
3256  // Search list for the team.
3257  l = list;
3258  while (l->next != NULL && l->entry != team) {
3259  l = l->next;
3260  }
3261  if (l->next != NULL) {
3262  return; // Team has been added before, exit.
3263  }
3264 
3265  // Team is not found. Search list again for insertion point.
3266  l = list;
3267  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3268  l = l->next;
3269  }
3270 
3271  // Insert team.
3272  {
3273  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3274  sizeof(kmp_team_list_item_t));
3275  *item = *l;
3276  l->entry = team;
3277  l->next = item;
3278  }
3279 }
3280 
3281 static void __kmp_print_structure_team(char const *title,
3282                                        kmp_team_p const *team) {
3283 
3284  __kmp_printf("%s", title);
3285  if (team != NULL) {
3286  __kmp_printf("%2x %p\n", team->t.t_id, team);
3287  } else {
3288  __kmp_printf(" - (nil)\n");
3289  }
3290 }
3291 
3292 static void __kmp_print_structure_thread(char const *title,
3293  kmp_info_p const *thread) {
3294  __kmp_printf("%s", title);
3295  if (thread != NULL) {
3296  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3297  } else {
3298  __kmp_printf(" - (nil)\n");
3299  }
3300 }
3301 
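// Debug dump of the runtime structures: the global thread table, the
// __kmp_threads and __kmp_root arrays, every known team, and the thread and
// team pools.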
3302 void __kmp_print_structure(void) {
3303 
3304  kmp_team_list_t list;
3305 
3306  // Initialize list of teams.
3307  list =
3308  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3309  list->entry = NULL;
3310  list->next = NULL;
3311 
3312  __kmp_printf("\n------------------------------\nGlobal Thread "
3313  "Table\n------------------------------\n");
3314  {
3315  int gtid;
3316  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3317  __kmp_printf("%2d", gtid);
3318  if (__kmp_threads != NULL) {
3319  __kmp_printf(" %p", __kmp_threads[gtid]);
3320  }
3321  if (__kmp_root != NULL) {
3322  __kmp_printf(" %p", __kmp_root[gtid]);
3323  }
3324  __kmp_printf("\n");
3325  }
3326  }
3327 
3328  // Print out __kmp_threads array.
3329  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3330  "----------\n");
3331  if (__kmp_threads != NULL) {
3332  int gtid;
3333  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3334  kmp_info_t const *thread = __kmp_threads[gtid];
3335  if (thread != NULL) {
3336  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3337  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3338  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3339  __kmp_print_structure_team(" Serial Team: ",
3340  thread->th.th_serial_team);
3341  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3342  __kmp_print_structure_thread(" Primary: ",
3343  thread->th.th_team_master);
3344  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3345  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3346  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3347  __kmp_print_structure_thread(" Next in pool: ",
3348  thread->th.th_next_pool);
3349  __kmp_printf("\n");
3350  __kmp_print_structure_team_accum(list, thread->th.th_team);
3351  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3352  }
3353  }
3354  } else {
3355  __kmp_printf("Threads array is not allocated.\n");
3356  }
3357 
3358  // Print out __kmp_root array.
3359  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3360  "--------\n");
3361  if (__kmp_root != NULL) {
3362  int gtid;
3363  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3364  kmp_root_t const *root = __kmp_root[gtid];
3365  if (root != NULL) {
3366  __kmp_printf("GTID %2d %p:\n", gtid, root);
3367  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3368  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3369  __kmp_print_structure_thread(" Uber Thread: ",
3370  root->r.r_uber_thread);
3371  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3372  __kmp_printf(" In Parallel: %2d\n",
3373  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3374  __kmp_printf("\n");
3375  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3376  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3377  }
3378  }
3379  } else {
3380  __kmp_printf("Ubers array is not allocated.\n");
3381  }
3382 
3383  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3384  "--------\n");
3385  while (list->next != NULL) {
3386  kmp_team_p const *team = list->entry;
3387  int i;
3388  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3389  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3390  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3391  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3392  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3393  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3394  for (i = 0; i < team->t.t_nproc; ++i) {
3395  __kmp_printf(" Thread %2d: ", i);
3396  __kmp_print_structure_thread("", team->t.t_threads[i]);
3397  }
3398  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3399  __kmp_printf("\n");
3400  list = list->next;
3401  }
3402 
3403  // Print out __kmp_thread_pool and __kmp_team_pool.
3404  __kmp_printf("\n------------------------------\nPools\n----------------------"
3405  "--------\n");
3406  __kmp_print_structure_thread("Thread pool: ",
3407  CCAST(kmp_info_t *, __kmp_thread_pool));
3408  __kmp_print_structure_team("Team pool: ",
3409  CCAST(kmp_team_t *, __kmp_team_pool));
3410  __kmp_printf("\n");
3411 
3412  // Free team list.
3413  while (list != NULL) {
3414  kmp_team_list_item_t *item = list;
3415  list = list->next;
3416  KMP_INTERNAL_FREE(item);
3417  }
3418 }
3419 
3420 #endif
3421 
3422 //---------------------------------------------------------------------------
3423 // Stuff for per-thread fast random number generator
3424 // Table of primes
3425 static const unsigned __kmp_primes[] = {
3426  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3427  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3428  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3429  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3430  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3431  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3432  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3433  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3434  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3435  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3436  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3437 
3438 //---------------------------------------------------------------------------
3439 // __kmp_get_random: Get a random number using a linear congruential method.
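// The state update is x_{n+1} = a * x_n + 1 (mod 2^32 for a 32-bit unsigned);
// the upper 16 bits are returned because the low-order bits of a
// power-of-two-modulus LCG have short periods.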
3440 unsigned short __kmp_get_random(kmp_info_t *thread) {
3441  unsigned x = thread->th.th_x;
3442  unsigned short r = (unsigned short)(x >> 16);
3443 
3444  thread->th.th_x = x * thread->th.th_a + 1;
3445 
3446  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3447  thread->th.th_info.ds.ds_tid, r));
3448 
3449  return r;
3450 }
3451 //--------------------------------------------------------
3452 // __kmp_init_random: Initialize a random number generator
3453 void __kmp_init_random(kmp_info_t *thread) {
3454  unsigned seed = thread->th.th_info.ds.ds_tid;
3455 
3456  thread->th.th_a =
3457  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3458  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3459  KA_TRACE(30,
3460  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3461 }
3462 
3463 #if KMP_OS_WINDOWS
3464 /* reclaim array entries for root threads that are already dead, returns number
3465  * reclaimed */
3466 static int __kmp_reclaim_dead_roots(void) {
3467  int i, r = 0;
3468 
3469  for (i = 0; i < __kmp_threads_capacity; ++i) {
3470  if (KMP_UBER_GTID(i) &&
3471  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3472  !__kmp_root[i]
3473  ->r.r_active) { // AC: reclaim only roots died in non-active state
3474  r += __kmp_unregister_root_other_thread(i);
3475  }
3476  }
3477  return r;
3478 }
3479 #endif
3480 
3481 /* This function attempts to create free entries in __kmp_threads and
3482  __kmp_root, and returns the number of free entries generated.
3483 
3484  For Windows* OS static library, the first mechanism used is to reclaim array
3485  entries for root threads that are already dead.
3486 
3487  On all platforms, expansion is attempted on the arrays __kmp_threads and
3488  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3489  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3490  threadprivate cache array has been created. Synchronization with
3491  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3492 
3493  After any dead root reclamation, if the clipping value allows array expansion
3494  to result in the generation of a total of nNeed free slots, the function does
3495  that expansion. If not, nothing is done beyond the possible initial root
3496  thread reclamation.
3497 
3498  If any argument is negative, the behavior is undefined. */
3499 static int __kmp_expand_threads(int nNeed) {
3500  int added = 0;
3501  int minimumRequiredCapacity;
3502  int newCapacity;
3503  kmp_info_t **newThreads;
3504  kmp_root_t **newRoot;
3505 
3506  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3507  // resizing __kmp_threads does not need additional protection if foreign
3508  // threads are present
3509 
3510 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3511  /* only for Windows static library */
3512  /* reclaim array entries for root threads that are already dead */
3513  added = __kmp_reclaim_dead_roots();
3514 
3515  if (nNeed) {
3516  nNeed -= added;
3517  if (nNeed < 0)
3518  nNeed = 0;
3519  }
3520 #endif
3521  if (nNeed <= 0)
3522  return added;
3523 
3524  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3525  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3526  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3527  // > __kmp_max_nth in one of two ways:
3528  //
3529  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3530  // may not be reused by another thread, so we may need to increase
3531  // __kmp_threads_capacity to __kmp_max_nth + 1.
3532  //
3533  // 2) New foreign root(s) are encountered. We always register new foreign
3534  // roots. This may cause a smaller # of threads to be allocated at
3535  // subsequent parallel regions, but the worker threads hang around (and
3536  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3537  //
3538  // Anyway, that is the reason for moving the check to see if
3539  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3540  // instead of having it performed here. -BB
3541 
3542  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3543 
3544  /* compute expansion headroom to check if we can expand */
3545  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3546  /* possible expansion too small -- give up */
3547  return added;
3548  }
3549  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3550 
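  // Double the capacity until it covers the requirement, clipping at
  // __kmp_sys_max_nth.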
3551  newCapacity = __kmp_threads_capacity;
3552  do {
3553  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3554  : __kmp_sys_max_nth;
3555  } while (newCapacity < minimumRequiredCapacity);
3556  newThreads = (kmp_info_t **)__kmp_allocate(
3557  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3558  newRoot =
3559  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3560  KMP_MEMCPY(newThreads, __kmp_threads,
3561  __kmp_threads_capacity * sizeof(kmp_info_t *));
3562  KMP_MEMCPY(newRoot, __kmp_root,
3563  __kmp_threads_capacity * sizeof(kmp_root_t *));
3564 
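  // Install the new arrays, free the old storage, and then publish the new
  // capacity.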
3565  kmp_info_t **temp_threads = __kmp_threads;
3566  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3567  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3568  __kmp_free(temp_threads);
3569  added += newCapacity - __kmp_threads_capacity;
3570  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3571 
3572  if (newCapacity > __kmp_tp_capacity) {
3573  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3574  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3575  __kmp_threadprivate_resize_cache(newCapacity);
3576  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3577  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3578  }
3579  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3580  }
3581 
3582  return added;
3583 }
3584 
3585 /* Register the current thread as a root thread and obtain our gtid. We must
3586  have the __kmp_initz_lock held at this point. The argument is TRUE only if
3587  we are the thread that calls from __kmp_do_serial_initialize(). */
3588 int __kmp_register_root(int initial_thread) {
3589  kmp_info_t *root_thread;
3590  kmp_root_t *root;
3591  int gtid;
3592  int capacity;
3593  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3594  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3595  KMP_MB();
3596 
3597  /* 2007-03-02:
3598  If the initial thread has not invoked the OpenMP RTL yet, and this thread is
3599  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3600  does not work as expected -- it may return false (meaning there is at least
3601  one empty slot in the __kmp_threads array), but it is possible the only free
3602  slot is #0, which is reserved for the initial thread and so cannot be used
3603  for this one. The following code works around this bug.
3604 
3605  However, the right solution seems to be not to reserve slot #0 for the
3606  initial thread, because:
3607  (1) there is no magic in slot #0,
3608  (2) we cannot detect the initial thread reliably (the first thread that does
3609  serial initialization may not be a real initial thread).
3610  */
3611  capacity = __kmp_threads_capacity;
3612  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3613  --capacity;
3614  }
3615 
3616  // If it is not for initializing the hidden helper team, we need to take
3617  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3618  // in __kmp_threads_capacity.
3619  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3620  capacity -= __kmp_hidden_helper_threads_num;
3621  }
3622 
3623  /* see if there are too many threads */
3624  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3625  if (__kmp_tp_cached) {
3626  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3627  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3628  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3629  } else {
3630  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3631  __kmp_msg_null);
3632  }
3633  }
3634 
3635  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3636  // 0: initial thread, also a regular OpenMP thread.
3637  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3638  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3639  // regular OpenMP threads.
3640  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3641  // Find an available thread slot for hidden helper thread. Slots for hidden
3642  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3643  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3644  gtid <= __kmp_hidden_helper_threads_num;
3645  gtid++)
3646  ;
3647  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3648  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3649  "hidden helper thread: T#%d\n",
3650  gtid));
3651  } else {
3652  /* find an available thread slot */
3653  // Don't reassign the zero slot since we need that to only be used by
3654  // initial thread. Slots for hidden helper threads should also be skipped.
3655  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3656  gtid = 0;
3657  } else {
3658  for (gtid = __kmp_hidden_helper_threads_num + 1;
3659  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3660  ;
3661  }
3662  KA_TRACE(
3663  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3664  KMP_ASSERT(gtid < __kmp_threads_capacity);
3665  }
3666 
3667  /* update global accounting */
3668  __kmp_all_nth++;
3669  TCW_4(__kmp_nth, __kmp_nth + 1);
3670 
3671  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3672  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3673  if (__kmp_adjust_gtid_mode) {
3674  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3675  if (TCR_4(__kmp_gtid_mode) != 2) {
3676  TCW_4(__kmp_gtid_mode, 2);
3677  }
3678  } else {
3679  if (TCR_4(__kmp_gtid_mode) != 1) {
3680  TCW_4(__kmp_gtid_mode, 1);
3681  }
3682  }
3683  }
3684 
3685 #ifdef KMP_ADJUST_BLOCKTIME
3686  /* Adjust blocktime to zero if necessary */
3687  /* Middle initialization might not have occurred yet */
3688  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3689  if (__kmp_nth > __kmp_avail_proc) {
3690  __kmp_zero_bt = TRUE;
3691  }
3692  }
3693 #endif /* KMP_ADJUST_BLOCKTIME */
3694 
3695  /* setup this new hierarchy */
3696  if (!(root = __kmp_root[gtid])) {
3697  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3698  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3699  }
3700 
3701 #if KMP_STATS_ENABLED
3702  // Initialize stats as soon as possible (right after gtid assignment).
3703  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3704  __kmp_stats_thread_ptr->startLife();
3705  KMP_SET_THREAD_STATE(SERIAL_REGION);
3706  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3707 #endif
3708  __kmp_initialize_root(root);
3709 
3710  /* setup new root thread structure */
3711  if (root->r.r_uber_thread) {
3712  root_thread = root->r.r_uber_thread;
3713  } else {
3714  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3715  if (__kmp_storage_map) {
3716  __kmp_print_thread_storage_map(root_thread, gtid);
3717  }
3718  root_thread->th.th_info.ds.ds_gtid = gtid;
3719 #if OMPT_SUPPORT
3720  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3721 #endif
3722  root_thread->th.th_root = root;
3723  if (__kmp_env_consistency_check) {
3724  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3725  }
3726 #if USE_FAST_MEMORY
3727  __kmp_initialize_fast_memory(root_thread);
3728 #endif /* USE_FAST_MEMORY */
3729 
3730 #if KMP_USE_BGET
3731  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3732  __kmp_initialize_bget(root_thread);
3733 #endif
3734  __kmp_init_random(root_thread); // Initialize random number generator
3735  }
3736 
3737  /* setup the serial team held in reserve by the root thread */
3738  if (!root_thread->th.th_serial_team) {
3739  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3740  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3741  root_thread->th.th_serial_team = __kmp_allocate_team(
3742  root, 1, 1,
3743 #if OMPT_SUPPORT
3744  ompt_data_none, // root parallel id
3745 #endif
3746  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3747  }
3748  KMP_ASSERT(root_thread->th.th_serial_team);
3749  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3750  root_thread->th.th_serial_team));
3751 
3752  /* drop root_thread into place */
3753  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3754 
3755  root->r.r_root_team->t.t_threads[0] = root_thread;
3756  root->r.r_hot_team->t.t_threads[0] = root_thread;
3757  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3758  // AC: the team created in reserve, not for execution (it is unused for now).
3759  root_thread->th.th_serial_team->t.t_serialized = 0;
3760  root->r.r_uber_thread = root_thread;
3761 
3762  /* initialize the thread, get it ready to go */
3763  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3764  TCW_4(__kmp_init_gtid, TRUE);
3765 
3766  /* prepare the primary thread for get_gtid() */
3767  __kmp_gtid_set_specific(gtid);
3768 
3769 #if USE_ITT_BUILD
3770  __kmp_itt_thread_name(gtid);
3771 #endif /* USE_ITT_BUILD */
3772 
3773 #ifdef KMP_TDATA_GTID
3774  __kmp_gtid = gtid;
3775 #endif
3776  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3777  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3778 
3779  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3780  "plain=%u\n",
3781  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3782  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3783  KMP_INIT_BARRIER_STATE));
3784  { // Initialize barrier data.
3785  int b;
3786  for (b = 0; b < bs_last_barrier; ++b) {
3787  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3788 #if USE_DEBUGGER
3789  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3790 #endif
3791  }
3792  }
3793  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3794  KMP_INIT_BARRIER_STATE);
3795 
3796 #if KMP_AFFINITY_SUPPORTED
3797  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3798  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3799  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3800  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3801  if (TCR_4(__kmp_init_middle)) {
3802  __kmp_affinity_set_init_mask(gtid, TRUE);
3803  }
3804 #endif /* KMP_AFFINITY_SUPPORTED */
3805  root_thread->th.th_def_allocator = __kmp_def_allocator;
3806  root_thread->th.th_prev_level = 0;
3807  root_thread->th.th_prev_num_threads = 1;
3808 
3809  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3810  tmp->cg_root = root_thread;
3811  tmp->cg_thread_limit = __kmp_cg_max_nth;
3812  tmp->cg_nthreads = 1;
3813  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3814  " cg_nthreads init to 1\n",
3815  root_thread, tmp));
3816  tmp->up = NULL;
3817  root_thread->th.th_cg_roots = tmp;
3818 
3819  __kmp_root_counter++;
3820 
3821 #if OMPT_SUPPORT
3822  if (!initial_thread && ompt_enabled.enabled) {
3823 
3824  kmp_info_t *root_thread = ompt_get_thread();
3825 
3826  ompt_set_thread_state(root_thread, ompt_state_overhead);
3827 
3828  if (ompt_enabled.ompt_callback_thread_begin) {
3829  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3830  ompt_thread_initial, __ompt_get_thread_data_internal());
3831  }
3832  ompt_data_t *task_data;
3833  ompt_data_t *parallel_data;
3834  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3835  NULL);
3836  if (ompt_enabled.ompt_callback_implicit_task) {
3837  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3838  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3839  }
3840 
3841  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3842  }
3843 #endif
3844 
3845  KMP_MB();
3846  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3847 
3848  return gtid;
3849 }
3850 
3851 #if KMP_NESTED_HOT_TEAMS
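// Recursively free the nested hot teams rooted at thr starting at the given
// level; returns the number of threads released (the primary thread at each
// level is not counted).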
3852 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3853  const int max_level) {
3854  int i, n, nth;
3855  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3856  if (!hot_teams || !hot_teams[level].hot_team) {
3857  return 0;
3858  }
3859  KMP_DEBUG_ASSERT(level < max_level);
3860  kmp_team_t *team = hot_teams[level].hot_team;
3861  nth = hot_teams[level].hot_team_nth;
3862  n = nth - 1; // primary thread is not freed
3863  if (level < max_level - 1) {
3864  for (i = 0; i < nth; ++i) {
3865  kmp_info_t *th = team->t.t_threads[i];
3866  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3867  if (i > 0 && th->th.th_hot_teams) {
3868  __kmp_free(th->th.th_hot_teams);
3869  th->th.th_hot_teams = NULL;
3870  }
3871  }
3872  }
3873  __kmp_free_team(root, team, NULL);
3874  return n;
3875 }
3876 #endif
3877 
3878 // Resets a root thread and clears its root and hot teams.
3879 // Returns the number of __kmp_threads entries directly and indirectly freed.
3880 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3881  kmp_team_t *root_team = root->r.r_root_team;
3882  kmp_team_t *hot_team = root->r.r_hot_team;
3883  int n = hot_team->t.t_nproc;
3884  int i;
3885 
3886  KMP_DEBUG_ASSERT(!root->r.r_active);
3887 
3888  root->r.r_root_team = NULL;
3889  root->r.r_hot_team = NULL;
3890  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3891  // before call to __kmp_free_team().
3892  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3893 #if KMP_NESTED_HOT_TEAMS
3894  if (__kmp_hot_teams_max_level >
3895  0) { // need to free nested hot teams and their threads if any
3896  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3897  kmp_info_t *th = hot_team->t.t_threads[i];
3898  if (__kmp_hot_teams_max_level > 1) {
3899  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3900  }
3901  if (th->th.th_hot_teams) {
3902  __kmp_free(th->th.th_hot_teams);
3903  th->th.th_hot_teams = NULL;
3904  }
3905  }
3906  }
3907 #endif
3908  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3909 
3910  // Before we can reap the thread, we need to make certain that all other
3911  // threads in the teams that had this root as ancestor have stopped trying to
3912  // steal tasks.
3913  if (__kmp_tasking_mode != tskm_immediate_exec) {
3914  __kmp_wait_to_unref_task_teams();
3915  }
3916 
3917 #if KMP_OS_WINDOWS
3918  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3919  KA_TRACE(
3920  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3921  "\n",
3922  (LPVOID) & (root->r.r_uber_thread->th),
3923  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3924  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3925 #endif /* KMP_OS_WINDOWS */
3926 
3927 #if OMPT_SUPPORT
3928  ompt_data_t *task_data;
3929  ompt_data_t *parallel_data;
3930  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3931  NULL);
3932  if (ompt_enabled.ompt_callback_implicit_task) {
3933  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3934  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3935  }
3936  if (ompt_enabled.ompt_callback_thread_end) {
3937  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3938  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3939  }
3940 #endif
3941 
3942  TCW_4(__kmp_nth,
3943  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3944  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3945  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3946  " to %d\n",
3947  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3948  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3949  if (i == 1) {
3950  // need to free contention group structure
3951  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3952  root->r.r_uber_thread->th.th_cg_roots->cg_root);
3953  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3954  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3955  root->r.r_uber_thread->th.th_cg_roots = NULL;
3956  }
3957  __kmp_reap_thread(root->r.r_uber_thread, 1);
3958 
3959  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3960  // it instead of freeing it.
3961  root->r.r_uber_thread = NULL;
3962  /* mark root as no longer in use */
3963  root->r.r_begin = FALSE;
3964 
3965  return n;
3966 }
3967 
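// Unregister the calling root thread: under the fork/join lock, wait for any
// outstanding proxy tasks and then reset the root.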
3968 void __kmp_unregister_root_current_thread(int gtid) {
3969  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3970  /* this lock should be ok, since unregister_root_current_thread is never
3971  called during an abort, only during a normal close. furthermore, if you
3972  have the forkjoin lock, you should never try to get the initz lock */
3973  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3974  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3975  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3976  "exiting T#%d\n",
3977  gtid));
3978  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3979  return;
3980  }
3981  kmp_root_t *root = __kmp_root[gtid];
3982 
3983  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3984  KMP_ASSERT(KMP_UBER_GTID(gtid));
3985  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3986  KMP_ASSERT(root->r.r_active == FALSE);
3987 
3988  KMP_MB();
3989 
3990  kmp_info_t *thread = __kmp_threads[gtid];
3991  kmp_team_t *team = thread->th.th_team;
3992  kmp_task_team_t *task_team = thread->th.th_task_team;
3993 
3994  // we need to wait for the proxy tasks before finishing the thread
3995  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3996 #if OMPT_SUPPORT
3997  // the runtime is shutting down so we won't report any events
3998  thread->th.ompt_thread_info.state = ompt_state_undefined;
3999 #endif
4000  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4001  }
4002 
4003  __kmp_reset_root(gtid, root);
4004 
4005  KMP_MB();
4006  KC_TRACE(10,
4007  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4008 
4009  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4010 }
4011 
4012 #if KMP_OS_WINDOWS
4013 /* __kmp_forkjoin_lock must be already held
4014  Unregisters a root thread that is not the current thread. Returns the number
4015  of __kmp_threads entries freed as a result. */
4016 static int __kmp_unregister_root_other_thread(int gtid) {
4017  kmp_root_t *root = __kmp_root[gtid];
4018  int r;
4019 
4020  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4021  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4022  KMP_ASSERT(KMP_UBER_GTID(gtid));
4023  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4024  KMP_ASSERT(root->r.r_active == FALSE);
4025 
4026  r = __kmp_reset_root(gtid, root);
4027  KC_TRACE(10,
4028  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4029  return r;
4030 }
4031 #endif
4032 
4033 #if KMP_DEBUG
4034 void __kmp_task_info() {
4035 
4036  kmp_int32 gtid = __kmp_entry_gtid();
4037  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4038  kmp_info_t *this_thr = __kmp_threads[gtid];
4039  kmp_team_t *steam = this_thr->th.th_serial_team;
4040  kmp_team_t *team = this_thr->th.th_team;
4041 
4042  __kmp_printf(
4043  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4044  "ptask=%p\n",
4045  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4046  team->t.t_implicit_task_taskdata[tid].td_parent);
4047 }
4048 #endif // KMP_DEBUG
4049 
4050 /* TODO optimize with one big memclr, take out what isn't needed, split
4051  responsibility to workers as much as possible, and delay initialization of
4052  features as much as possible */
4053 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4054  int tid, int gtid) {
4055  /* this_thr->th.th_info.ds.ds_gtid is setup in
4056  kmp_allocate_thread/create_worker.
4057  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4058  KMP_DEBUG_ASSERT(this_thr != NULL);
4059  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4060  KMP_DEBUG_ASSERT(team);
4061  KMP_DEBUG_ASSERT(team->t.t_threads);
4062  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4063  kmp_info_t *master = team->t.t_threads[0];
4064  KMP_DEBUG_ASSERT(master);
4065  KMP_DEBUG_ASSERT(master->th.th_root);
4066 
4067  KMP_MB();
4068 
4069  TCW_SYNC_PTR(this_thr->th.th_team, team);
4070 
4071  this_thr->th.th_info.ds.ds_tid = tid;
4072  this_thr->th.th_set_nproc = 0;
4073  if (__kmp_tasking_mode != tskm_immediate_exec)
4074  // When tasking is possible, threads are not safe to reap until they are
4075  // done tasking; this will be set when tasking code is exited in wait
4076  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4077  else // no tasking --> always safe to reap
4078  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4079  this_thr->th.th_set_proc_bind = proc_bind_default;
4080 #if KMP_AFFINITY_SUPPORTED
4081  this_thr->th.th_new_place = this_thr->th.th_current_place;
4082 #endif
4083  this_thr->th.th_root = master->th.th_root;
4084 
4085  /* setup the thread's cache of the team structure */
4086  this_thr->th.th_team_nproc = team->t.t_nproc;
4087  this_thr->th.th_team_master = master;
4088  this_thr->th.th_team_serialized = team->t.t_serialized;
4089  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4090 
4091  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4092 
4093  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4094  tid, gtid, this_thr, this_thr->th.th_current_task));
4095 
4096  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4097  team, tid, TRUE);
4098 
4099  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4100  tid, gtid, this_thr, this_thr->th.th_current_task));
4101  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4102  // __kmp_initialize_team()?
4103 
4104  /* TODO no worksharing in speculative threads */
4105  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4106 
4107  this_thr->th.th_local.this_construct = 0;
4108 
4109  if (!this_thr->th.th_pri_common) {
4110  this_thr->th.th_pri_common =
4111  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4112  if (__kmp_storage_map) {
4113  __kmp_print_storage_map_gtid(
4114  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4115  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4116  }
4117  this_thr->th.th_pri_head = NULL;
4118  }
4119 
4120  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4121  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4122  // Make new thread's CG root same as primary thread's
4123  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4124  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4125  if (tmp) {
4126  // worker changes CG, need to check if old CG should be freed
4127  int i = tmp->cg_nthreads--;
4128  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4129  " on node %p of thread %p to %d\n",
4130  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4131  if (i == 1) {
4132  __kmp_free(tmp); // last thread left CG --> free it
4133  }
4134  }
4135  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4136  // Increment new thread's CG root's counter to add the new thread
4137  this_thr->th.th_cg_roots->cg_nthreads++;
4138  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4139  " node %p of thread %p to %d\n",
4140  this_thr, this_thr->th.th_cg_roots,
4141  this_thr->th.th_cg_roots->cg_root,
4142  this_thr->th.th_cg_roots->cg_nthreads));
4143  this_thr->th.th_current_task->td_icvs.thread_limit =
4144  this_thr->th.th_cg_roots->cg_thread_limit;
4145  }
4146 
4147  /* Initialize dynamic dispatch */
4148  {
4149  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4150  // Use team max_nproc since this will never change for the team.
4151  size_t disp_size =
4152  sizeof(dispatch_private_info_t) *
4153  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4154  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4155  team->t.t_max_nproc));
4156  KMP_ASSERT(dispatch);
4157  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4158  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4159 
4160  dispatch->th_disp_index = 0;
4161  dispatch->th_doacross_buf_idx = 0;
4162  if (!dispatch->th_disp_buffer) {
4163  dispatch->th_disp_buffer =
4164  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4165 
4166  if (__kmp_storage_map) {
4167  __kmp_print_storage_map_gtid(
4168  gtid, &dispatch->th_disp_buffer[0],
4169  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4170  ? 1
4171  : __kmp_dispatch_num_buffers],
4172  disp_size,
4173  "th_%d.th_dispatch.th_disp_buffer "
4174  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4175  gtid, team->t.t_id, gtid);
4176  }
4177  } else {
4178  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4179  }
4180 
4181  dispatch->th_dispatch_pr_current = 0;
4182  dispatch->th_dispatch_sh_current = 0;
4183 
4184  dispatch->th_deo_fcn = 0; /* ORDERED */
4185  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4186  }
4187 
4188  this_thr->th.th_next_pool = NULL;
4189 
4190  if (!this_thr->th.th_task_state_memo_stack) {
4191  size_t i;
4192  this_thr->th.th_task_state_memo_stack =
4193  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4194  this_thr->th.th_task_state_top = 0;
4195  this_thr->th.th_task_state_stack_sz = 4;
4196  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4197  ++i) // zero init the stack
4198  this_thr->th.th_task_state_memo_stack[i] = 0;
4199  }
4200 
4201  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4202  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4203 
4204  KMP_MB();
4205 }
4206 
4207 /* Allocate a new thread for the requesting team. This is only called from
4208  within a forkjoin critical section. We first try to get an available thread
4209  from the thread pool; if none is available, we fork a new one, assuming we
4210  are able to create one. This should be assured, as the caller should have
4211  checked on this first. */
4212 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4213  int new_tid) {
4214  kmp_team_t *serial_team;
4215  kmp_info_t *new_thr;
4216  int new_gtid;
4217 
4218  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4219  KMP_DEBUG_ASSERT(root && team);
4220 #if !KMP_NESTED_HOT_TEAMS
4221  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4222 #endif
4223  KMP_MB();
4224 
4225  /* first, try to get one from the thread pool */
4226  if (__kmp_thread_pool) {
4227  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4228  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4229  if (new_thr == __kmp_thread_pool_insert_pt) {
4230  __kmp_thread_pool_insert_pt = NULL;
4231  }
4232  TCW_4(new_thr->th.th_in_pool, FALSE);
4233  __kmp_suspend_initialize_thread(new_thr);
4234  __kmp_lock_suspend_mx(new_thr);
4235  if (new_thr->th.th_active_in_pool == TRUE) {
4236  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4237  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4238  new_thr->th.th_active_in_pool = FALSE;
4239  }
4240  __kmp_unlock_suspend_mx(new_thr);
4241 
4242  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4243  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4244  KMP_ASSERT(!new_thr->th.th_team);
4245  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4246 
4247  /* setup the thread structure */
4248  __kmp_initialize_info(new_thr, team, new_tid,
4249  new_thr->th.th_info.ds.ds_gtid);
4250  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4251 
4252  TCW_4(__kmp_nth, __kmp_nth + 1);
4253 
4254  new_thr->th.th_task_state = 0;
4255  new_thr->th.th_task_state_top = 0;
4256  new_thr->th.th_task_state_stack_sz = 4;
4257 
4258 #ifdef KMP_ADJUST_BLOCKTIME
4259  /* Adjust blocktime back to zero if necessary */
4260  /* Middle initialization might not have occurred yet */
4261  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4262  if (__kmp_nth > __kmp_avail_proc) {
4263  __kmp_zero_bt = TRUE;
4264  }
4265  }
4266 #endif /* KMP_ADJUST_BLOCKTIME */
4267 
4268 #if KMP_DEBUG
4269  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4270  // KMP_BARRIER_PARENT_FLAG.
4271  int b;
4272  kmp_balign_t *balign = new_thr->th.th_bar;
4273  for (b = 0; b < bs_last_barrier; ++b)
4274  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4275 #endif
4276 
4277  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4278  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4279 
4280  KMP_MB();
4281  return new_thr;
4282  }
4283 
4284  /* no, we'll fork a new one */
4285  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4286  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4287 
4288 #if KMP_USE_MONITOR
4289  // If this is the first worker thread the RTL is creating, then also
4290  // launch the monitor thread. We try to do this as early as possible.
4291  if (!TCR_4(__kmp_init_monitor)) {
4292  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4293  if (!TCR_4(__kmp_init_monitor)) {
4294  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4295  TCW_4(__kmp_init_monitor, 1);
4296  __kmp_create_monitor(&__kmp_monitor);
4297  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4298 #if KMP_OS_WINDOWS
4299  // AC: wait until monitor has started. This is a fix for CQ232808.
4300  // The reason is that if the library is loaded/unloaded in a loop with
4301  // small (parallel) work in between, then there is high probability that
4302  // monitor thread started after the library shutdown. At shutdown it is
4303  // too late to cope with the problem, because when the primary thread is
4304  // in DllMain (process detach) the monitor has no chances to start (it is
4305  // blocked), and primary thread has no means to inform the monitor that
4306  // the library has gone, because all the memory which the monitor can
4307  // access is going to be released/reset.
4308  while (TCR_4(__kmp_init_monitor) < 2) {
4309  KMP_YIELD(TRUE);
4310  }
4311  KF_TRACE(10, ("after monitor thread has started\n"));
4312 #endif
4313  }
4314  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4315  }
4316 #endif
4317 
4318  KMP_MB();
4319 
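  // Find the next free gtid slot: hidden helper threads use slots
  // [1, __kmp_hidden_helper_threads_num]; regular workers start after them.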
4320  {
4321  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4322  ? 1
4323  : __kmp_hidden_helper_threads_num + 1;
4324 
4325  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4326  ++new_gtid) {
4327  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4328  }
4329 
4330  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4331  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4332  }
4333  }
4334 
4335  /* allocate space for it. */
4336  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4337 
4338  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4339 
4340 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4341  // suppress race conditions detection on synchronization flags in debug mode
4342  // this helps to analyze library internals eliminating false positives
4343  __itt_suppress_mark_range(
4344  __itt_suppress_range, __itt_suppress_threading_errors,
4345  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4346  __itt_suppress_mark_range(
4347  __itt_suppress_range, __itt_suppress_threading_errors,
4348  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4349 #if KMP_OS_WINDOWS
4350  __itt_suppress_mark_range(
4351  __itt_suppress_range, __itt_suppress_threading_errors,
4352  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4353 #else
4354  __itt_suppress_mark_range(__itt_suppress_range,
4355  __itt_suppress_threading_errors,
4356  &new_thr->th.th_suspend_init_count,
4357  sizeof(new_thr->th.th_suspend_init_count));
4358 #endif
4359  // TODO: check if we need to also suppress b_arrived flags
4360  __itt_suppress_mark_range(__itt_suppress_range,
4361  __itt_suppress_threading_errors,
4362  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4363  sizeof(new_thr->th.th_bar[0].bb.b_go));
4364  __itt_suppress_mark_range(__itt_suppress_range,
4365  __itt_suppress_threading_errors,
4366  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4367  sizeof(new_thr->th.th_bar[1].bb.b_go));
4368  __itt_suppress_mark_range(__itt_suppress_range,
4369  __itt_suppress_threading_errors,
4370  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4371  sizeof(new_thr->th.th_bar[2].bb.b_go));
4372 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4373  if (__kmp_storage_map) {
4374  __kmp_print_thread_storage_map(new_thr, new_gtid);
4375  }
4376 
4377  // add the reserve serialized team, initialized from the team's primary thread
4378  {
4379  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4380  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4381  new_thr->th.th_serial_team = serial_team =
4382  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4383 #if OMPT_SUPPORT
4384  ompt_data_none, // root parallel id
4385 #endif
4386  proc_bind_default, &r_icvs,
4387  0 USE_NESTED_HOT_ARG(NULL));
4388  }
4389  KMP_ASSERT(serial_team);
4390  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4391  // execution (it is unused for now).
4392  serial_team->t.t_threads[0] = new_thr;
4393  KF_TRACE(10,
4394  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4395  new_thr));
4396 
4397  /* setup the thread structures */
4398  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4399 
4400 #if USE_FAST_MEMORY
4401  __kmp_initialize_fast_memory(new_thr);
4402 #endif /* USE_FAST_MEMORY */
4403 
4404 #if KMP_USE_BGET
4405  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4406  __kmp_initialize_bget(new_thr);
4407 #endif
4408 
4409  __kmp_init_random(new_thr); // Initialize random number generator
4410 
4411  /* Initialize these only once when thread is grabbed for a team allocation */
4412  KA_TRACE(20,
4413  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4414  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4415 
4416  int b;
4417  kmp_balign_t *balign = new_thr->th.th_bar;
4418  for (b = 0; b < bs_last_barrier; ++b) {
4419  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4420  balign[b].bb.team = NULL;
4421  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4422  balign[b].bb.use_oncore_barrier = 0;
4423  }
4424 
4425  new_thr->th.th_spin_here = FALSE;
4426  new_thr->th.th_next_waiting = 0;
4427 #if KMP_OS_UNIX
4428  new_thr->th.th_blocking = false;
4429 #endif
4430 
4431 #if KMP_AFFINITY_SUPPORTED
4432  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4433  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4434  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4435  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4436 #endif
4437  new_thr->th.th_def_allocator = __kmp_def_allocator;
4438  new_thr->th.th_prev_level = 0;
4439  new_thr->th.th_prev_num_threads = 1;
4440 
4441  TCW_4(new_thr->th.th_in_pool, FALSE);
4442  new_thr->th.th_active_in_pool = FALSE;
4443  TCW_4(new_thr->th.th_active, TRUE);
4444 
4445  /* adjust the global counters */
4446  __kmp_all_nth++;
4447  __kmp_nth++;
4448 
4449  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4450  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4451  if (__kmp_adjust_gtid_mode) {
4452  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4453  if (TCR_4(__kmp_gtid_mode) != 2) {
4454  TCW_4(__kmp_gtid_mode, 2);
4455  }
4456  } else {
4457  if (TCR_4(__kmp_gtid_mode) != 1) {
4458  TCW_4(__kmp_gtid_mode, 1);
4459  }
4460  }
4461  }
4462 
4463 #ifdef KMP_ADJUST_BLOCKTIME
4464  /* Adjust blocktime back to zero if necessary */
4465  /* Middle initialization might not have occurred yet */
4466  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4467  if (__kmp_nth > __kmp_avail_proc) {
4468  __kmp_zero_bt = TRUE;
4469  }
4470  }
4471 #endif /* KMP_ADJUST_BLOCKTIME */
4472 
4473  /* actually fork it and create the new worker thread */
4474  KF_TRACE(
4475  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4476  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4477  KF_TRACE(10,
4478  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4479 
4480  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4481  new_gtid));
4482  KMP_MB();
4483  return new_thr;
4484 }
4485 
4486 /* Reinitialize team for reuse.
4487  The hot team code calls this at every fork barrier, so the EPCC barrier
4488  tests are extremely sensitive to changes in it, esp. writes to the team
4489  struct, which cause a cache invalidation in all threads.
4490  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4491 static void __kmp_reinitialize_team(kmp_team_t *team,
4492  kmp_internal_control_t *new_icvs,
4493  ident_t *loc) {
4494  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4495  team->t.t_threads[0], team));
4496  KMP_DEBUG_ASSERT(team && new_icvs);
4497  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4498  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4499 
4500  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4501  // Copy ICVs to the primary thread's implicit taskdata
4502  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4503  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4504 
4505  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4506  team->t.t_threads[0], team));
4507 }
4508 
4509 /* Initialize the team data structure.
4510  This assumes the t_threads and t_max_nproc are already set.
4511  Also, we don't touch the arguments */
4512 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4513  kmp_internal_control_t *new_icvs,
4514  ident_t *loc) {
4515  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4516 
4517  /* verify */
4518  KMP_DEBUG_ASSERT(team);
4519  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4520  KMP_DEBUG_ASSERT(team->t.t_threads);
4521  KMP_MB();
4522 
4523  team->t.t_master_tid = 0; /* not needed */
4524  /* team->t.t_master_bar; not needed */
4525  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4526  team->t.t_nproc = new_nproc;
4527 
4528  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4529  team->t.t_next_pool = NULL;
4530  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4531  * up hot team */
4532 
4533  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4534  team->t.t_invoke = NULL; /* not needed */
4535 
4536  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4537  team->t.t_sched.sched = new_icvs->sched.sched;
4538 
4539 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4540  team->t.t_fp_control_saved = FALSE; /* not needed */
4541  team->t.t_x87_fpu_control_word = 0; /* not needed */
4542  team->t.t_mxcsr = 0; /* not needed */
4543 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4544 
4545  team->t.t_construct = 0;
4546 
4547  team->t.t_ordered.dt.t_value = 0;
4548  team->t.t_master_active = FALSE;
4549 
4550 #ifdef KMP_DEBUG
4551  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4552 #endif
4553 #if KMP_OS_WINDOWS
4554  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4555 #endif
4556 
4557  team->t.t_control_stack_top = NULL;
4558 
4559  __kmp_reinitialize_team(team, new_icvs, loc);
4560 
4561  KMP_MB();
4562  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4563 }
4564 
4565 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4566 /* Sets full mask for thread and returns old mask, no changes to structures. */
4567 static void
4568 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4569  if (KMP_AFFINITY_CAPABLE()) {
4570  int status;
4571  if (old_mask != NULL) {
4572  status = __kmp_get_system_affinity(old_mask, TRUE);
4573  int error = errno;
4574  if (status != 0) {
4575  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4576  __kmp_msg_null);
4577  }
4578  }
4579  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4580  }
4581 }
4582 #endif
4583 
4584 #if KMP_AFFINITY_SUPPORTED
4585 
4586 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4587 // It calculates the worker + primary thread's partition based upon the parent
4588 // thread's partition, and binds each worker to a thread in their partition.
4589 // The primary thread's partition should already include its current binding.
4590 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4591  // Copy the primary thread's place partition to the team struct
4592  kmp_info_t *master_th = team->t.t_threads[0];
4593  KMP_DEBUG_ASSERT(master_th != NULL);
4594  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4595  int first_place = master_th->th.th_first_place;
4596  int last_place = master_th->th.th_last_place;
4597  int masters_place = master_th->th.th_current_place;
4598  team->t.t_first_place = first_place;
4599  team->t.t_last_place = last_place;
4600 
4601  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4602  "bound to place %d partition = [%d,%d]\n",
4603  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4604  team->t.t_id, masters_place, first_place, last_place));
4605 
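  // In brief: proc_bind_primary binds every worker to the primary thread's
  // place; proc_bind_close packs workers into places near the primary's place
  // within the partition; proc_bind_spread spaces workers (and their
  // sub-partitions) as evenly as possible across the partition.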
4606  switch (proc_bind) {
4607 
4608  case proc_bind_default:
4609  // Serial teams might have the proc_bind policy set to proc_bind_default.
4610  // Not an issue -- we don't rebind the primary thread for any proc_bind policy.
4611  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4612  break;
4613 
4614  case proc_bind_primary: {
4615  int f;
4616  int n_th = team->t.t_nproc;
4617  for (f = 1; f < n_th; f++) {
4618  kmp_info_t *th = team->t.t_threads[f];
4619  KMP_DEBUG_ASSERT(th != NULL);
4620  th->th.th_first_place = first_place;
4621  th->th.th_last_place = last_place;
4622  th->th.th_new_place = masters_place;
4623  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4624  team->t.t_display_affinity != 1) {
4625  team->t.t_display_affinity = 1;
4626  }
4627 
4628  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4629  "partition = [%d,%d]\n",
4630  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4631  f, masters_place, first_place, last_place));
4632  }
4633  } break;
4634 
4635  case proc_bind_close: {
4636  int f;
4637  int n_th = team->t.t_nproc;
4638  int n_places;
4639  if (first_place <= last_place) {
4640  n_places = last_place - first_place + 1;
4641  } else {
4642  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4643  }
4644  if (n_th <= n_places) {
4645  int place = masters_place;
4646  for (f = 1; f < n_th; f++) {
4647  kmp_info_t *th = team->t.t_threads[f];
4648  KMP_DEBUG_ASSERT(th != NULL);
4649 
4650  if (place == last_place) {
4651  place = first_place;
4652  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4653  place = 0;
4654  } else {
4655  place++;
4656  }
4657  th->th.th_first_place = first_place;
4658  th->th.th_last_place = last_place;
4659  th->th.th_new_place = place;
4660  if (__kmp_display_affinity && place != th->th.th_current_place &&
4661  team->t.t_display_affinity != 1) {
4662  team->t.t_display_affinity = 1;
4663  }
4664 
4665  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4666  "partition = [%d,%d]\n",
4667  __kmp_gtid_from_thread(team->t.t_threads[f]),
4668  team->t.t_id, f, place, first_place, last_place));
4669  }
4670  } else {
4671  int S, rem, gap, s_count;
4672  S = n_th / n_places;
4673  s_count = 0;
4674  rem = n_th - (S * n_places);
4675  gap = rem > 0 ? n_places / rem : n_places;
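  // Oversubscribed case: each place gets S = n_th / n_places threads, and
  // `rem` of the places (every `gap`-th one) get one extra. E.g., 10 threads
  // over 4 places gives S = 2, rem = 2, gap = 2, so the places receive
  // 3, 2, 3, 2 threads, starting from the primary thread's place.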
4676  int place = masters_place;
4677  int gap_ct = gap;
4678  for (f = 0; f < n_th; f++) {
4679  kmp_info_t *th = team->t.t_threads[f];
4680  KMP_DEBUG_ASSERT(th != NULL);
4681 
4682  th->th.th_first_place = first_place;
4683  th->th.th_last_place = last_place;
4684  th->th.th_new_place = place;
4685  if (__kmp_display_affinity && place != th->th.th_current_place &&
4686  team->t.t_display_affinity != 1) {
4687  team->t.t_display_affinity = 1;
4688  }
4689  s_count++;
4690 
4691  if ((s_count == S) && rem && (gap_ct == gap)) {
4692  // do nothing, add an extra thread to place on next iteration
4693  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4694  // we added an extra thread to this place; move to next place
4695  if (place == last_place) {
4696  place = first_place;
4697  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4698  place = 0;
4699  } else {
4700  place++;
4701  }
4702  s_count = 0;
4703  gap_ct = 1;
4704  rem--;
4705  } else if (s_count == S) { // place full; don't add extra
4706  if (place == last_place) {
4707  place = first_place;
4708  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4709  place = 0;
4710  } else {
4711  place++;
4712  }
4713  gap_ct++;
4714  s_count = 0;
4715  }
4716 
4717  KA_TRACE(100,
4718  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4719  "partition = [%d,%d]\n",
4720  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4721  th->th.th_new_place, first_place, last_place));
4722  }
4723  KMP_DEBUG_ASSERT(place == masters_place);
4724  }
4725  } break;
4726 
4727  case proc_bind_spread: {
4728  int f;
4729  int n_th = team->t.t_nproc;
4730  int n_places;
4731  int thidx;
4732  if (first_place <= last_place) {
4733  n_places = last_place - first_place + 1;
4734  } else {
4735  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4736  }
4737  if (n_th <= n_places) {
4738  int place = -1;
4739 
4740  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4741  int S = n_places / n_th;
4742  int s_count, rem, gap, gap_ct;
4743 
4744  place = masters_place;
4745  rem = n_places - n_th * S;
4746  gap = rem ? n_th / rem : 1;
4747  gap_ct = gap;
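  // Each thread gets a consecutive sub-partition of S = n_places / n_th
  // places; `rem` of the sub-partitions (every `gap`-th one) get one extra
  // place, and each thread is bound to the first place of its sub-partition.
  // E.g., 4 threads over 10 places gives sub-partitions of 3, 2, 3, 2 places.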
4748  thidx = n_th;
4749  if (update_master_only == 1)
4750  thidx = 1;
4751  for (f = 0; f < thidx; f++) {
4752  kmp_info_t *th = team->t.t_threads[f];
4753  KMP_DEBUG_ASSERT(th != NULL);
4754 
4755  th->th.th_first_place = place;
4756  th->th.th_new_place = place;
4757  if (__kmp_display_affinity && place != th->th.th_current_place &&
4758  team->t.t_display_affinity != 1) {
4759  team->t.t_display_affinity = 1;
4760  }
4761  s_count = 1;
4762  while (s_count < S) {
4763  if (place == last_place) {
4764  place = first_place;
4765  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4766  place = 0;
4767  } else {
4768  place++;
4769  }
4770  s_count++;
4771  }
4772  if (rem && (gap_ct == gap)) {
4773  if (place == last_place) {
4774  place = first_place;
4775  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4776  place = 0;
4777  } else {
4778  place++;
4779  }
4780  rem--;
4781  gap_ct = 0;
4782  }
4783  th->th.th_last_place = place;
4784  gap_ct++;
4785 
4786  if (place == last_place) {
4787  place = first_place;
4788  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4789  place = 0;
4790  } else {
4791  place++;
4792  }
4793 
4794  KA_TRACE(100,
4795  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4796  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4797  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4798  f, th->th.th_new_place, th->th.th_first_place,
4799  th->th.th_last_place, __kmp_affinity_num_masks));
4800  }
4801  } else {
4802  /* Given a uniform space of available computation places, we can create
4803  T partitions of round(P/T) size and put each thread into the first
4804  place of its partition. */
4805  double current = static_cast<double>(masters_place);
4806  double spacing =
4807  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
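  // E.g., with the primary thread at place 0, 3 threads over 8 places gives
  // spacing = 3.0 and thread partitions [0,2], [3,5], [6,7] (the last one is
  // clamped to the number of places).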
4808  int first, last;
4809  kmp_info_t *th;
4810 
4811  thidx = n_th + 1;
4812  if (update_master_only == 1)
4813  thidx = 1;
4814  for (f = 0; f < thidx; f++) {
4815  first = static_cast<int>(current);
4816  last = static_cast<int>(current + spacing) - 1;
4817  KMP_DEBUG_ASSERT(last >= first);
4818  if (first >= n_places) {
4819  if (masters_place) {
4820  first -= n_places;
4821  last -= n_places;
4822  if (first == (masters_place + 1)) {
4823  KMP_DEBUG_ASSERT(f == n_th);
4824  first--;
4825  }
4826  if (last == masters_place) {
4827  KMP_DEBUG_ASSERT(f == (n_th - 1));
4828  last--;
4829  }
4830  } else {
4831  KMP_DEBUG_ASSERT(f == n_th);
4832  first = 0;
4833  last = 0;
4834  }
4835  }
4836  if (last >= n_places) {
4837  last = (n_places - 1);
4838  }
4839  place = first;
4840  current += spacing;
4841  if (f < n_th) {
4842  KMP_DEBUG_ASSERT(0 <= first);
4843  KMP_DEBUG_ASSERT(n_places > first);
4844  KMP_DEBUG_ASSERT(0 <= last);
4845  KMP_DEBUG_ASSERT(n_places > last);
4846  KMP_DEBUG_ASSERT(last_place >= first_place);
4847  th = team->t.t_threads[f];
4848  KMP_DEBUG_ASSERT(th);
4849  th->th.th_first_place = first;
4850  th->th.th_new_place = place;
4851  th->th.th_last_place = last;
4852  if (__kmp_display_affinity && place != th->th.th_current_place &&
4853  team->t.t_display_affinity != 1) {
4854  team->t.t_display_affinity = 1;
4855  }
4856  KA_TRACE(100,
4857  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4858  "partition = [%d,%d], spacing = %.4f\n",
4859  __kmp_gtid_from_thread(team->t.t_threads[f]),
4860  team->t.t_id, f, th->th.th_new_place,
4861  th->th.th_first_place, th->th.th_last_place, spacing));
4862  }
4863  }
4864  }
4865  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4866  } else {
4867  int S, rem, gap, s_count;
4868  S = n_th / n_places;
4869  s_count = 0;
4870  rem = n_th - (S * n_places);
4871  gap = rem > 0 ? n_places / rem : n_places;
4872  int place = masters_place;
4873  int gap_ct = gap;
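  // Oversubscribed case: same S/rem/gap distribution as proc_bind_close above,
  // except that each thread is pinned to a single place (its first, last and
  // new place are all the same).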
4874  thidx = n_th;
4875  if (update_master_only == 1)
4876  thidx = 1;
4877  for (f = 0; f < thidx; f++) {
4878  kmp_info_t *th = team->t.t_threads[f];
4879  KMP_DEBUG_ASSERT(th != NULL);
4880 
4881  th->th.th_first_place = place;
4882  th->th.th_last_place = place;
4883  th->th.th_new_place = place;
4884  if (__kmp_display_affinity && place != th->th.th_current_place &&
4885  team->t.t_display_affinity != 1) {
4886  team->t.t_display_affinity = 1;
4887  }
4888  s_count++;
4889 
4890  if ((s_count == S) && rem && (gap_ct == gap)) {
4891  // do nothing, add an extra thread to place on next iteration
4892  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4893  // we added an extra thread to this place; move on to next place
4894  if (place == last_place) {
4895  place = first_place;
4896  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4897  place = 0;
4898  } else {
4899  place++;
4900  }
4901  s_count = 0;
4902  gap_ct = 1;
4903  rem--;
4904  } else if (s_count == S) { // place is full; don't add extra thread
4905  if (place == last_place) {
4906  place = first_place;
4907  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4908  place = 0;
4909  } else {
4910  place++;
4911  }
4912  gap_ct++;
4913  s_count = 0;
4914  }
4915 
4916  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4917  "partition = [%d,%d]\n",
4918  __kmp_gtid_from_thread(team->t.t_threads[f]),
4919  team->t.t_id, f, th->th.th_new_place,
4920  th->th.th_first_place, th->th.th_last_place));
4921  }
4922  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4923  }
4924  } break;
4925 
4926  default:
4927  break;
4928  }
4929 
4930  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4931 }
4932 
4933 #endif // KMP_AFFINITY_SUPPORTED
4934 
4935 /* allocate a new team data structure to use. take one off of the free pool if
4936  available */
4937 kmp_team_t *
4938 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4939 #if OMPT_SUPPORT
4940  ompt_data_t ompt_parallel_data,
4941 #endif
4942  kmp_proc_bind_t new_proc_bind,
4943  kmp_internal_control_t *new_icvs,
4944  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4945  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4946  int f;
4947  kmp_team_t *team;
4948  int use_hot_team = !root->r.r_active;
4949  int level = 0;
4950 
4951  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4952  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4953  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4954  KMP_MB();
4955 
4956 #if KMP_NESTED_HOT_TEAMS
4957  kmp_hot_team_ptr_t *hot_teams;
4958  if (master) {
4959  team = master->th.th_team;
4960  level = team->t.t_active_level;
4961  if (master->th.th_teams_microtask) { // in teams construct?
4962  if (master->th.th_teams_size.nteams > 1 &&
4963  ( // #teams > 1
4964  team->t.t_pkfn ==
4965  (microtask_t)__kmp_teams_master || // inner fork of the teams
4966  master->th.th_teams_level <
4967  team->t.t_level)) { // or nested parallel inside the teams
4968  ++level; // do not increment if #teams==1, or for the outer fork of the teams;
4969  // increment otherwise
4970  }
4971  }
4972  hot_teams = master->th.th_hot_teams;
4973  if (level < __kmp_hot_teams_max_level && hot_teams &&
4974  hot_teams[level].hot_team) {
4975  // hot team has already been allocated for given level
4976  use_hot_team = 1;
4977  } else {
4978  use_hot_team = 0;
4979  }
4980  } else {
4981  // check we won't access uninitialized hot_teams, just in case
4982  KMP_DEBUG_ASSERT(new_nproc == 1);
4983  }
4984 #endif
4985  // Optimization to use a "hot" team
4986  if (use_hot_team && new_nproc > 1) {
4987  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4988 #if KMP_NESTED_HOT_TEAMS
4989  team = hot_teams[level].hot_team;
4990 #else
4991  team = root->r.r_hot_team;
4992 #endif
4993 #if KMP_DEBUG
4994  if (__kmp_tasking_mode != tskm_immediate_exec) {
4995  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4996  "task_team[1] = %p before reinit\n",
4997  team->t.t_task_team[0], team->t.t_task_team[1]));
4998  }
4999 #endif
5000 
5001  // Has the number of threads changed?
5002  /* Let's assume the most common case is that the number of threads is
5003  unchanged, and put that case first. */
5004  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5005  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5006  // This case can mean that omp_set_num_threads() was called and the hot
5007  // team size was already reduced, so we check the special flag
5008  if (team->t.t_size_changed == -1) {
5009  team->t.t_size_changed = 1;
5010  } else {
5011  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5012  }
5013 
5014  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5015  kmp_r_sched_t new_sched = new_icvs->sched;
5016  // set primary thread's schedule as new run-time schedule
5017  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5018 
5019  __kmp_reinitialize_team(team, new_icvs,
5020  root->r.r_uber_thread->th.th_ident);
5021 
5022  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5023  team->t.t_threads[0], team));
5024  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5025 
5026 #if KMP_AFFINITY_SUPPORTED
5027  if ((team->t.t_size_changed == 0) &&
5028  (team->t.t_proc_bind == new_proc_bind)) {
5029  if (new_proc_bind == proc_bind_spread) {
5030  __kmp_partition_places(
5031  team, 1); // add flag to update only master for spread
5032  }
5033  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5034  "proc_bind = %d, partition = [%d,%d]\n",
5035  team->t.t_id, new_proc_bind, team->t.t_first_place,
5036  team->t.t_last_place));
5037  } else {
5038  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5039  __kmp_partition_places(team);
5040  }
5041 #else
5042  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5043 #endif /* KMP_AFFINITY_SUPPORTED */
5044  } else if (team->t.t_nproc > new_nproc) {
5045  KA_TRACE(20,
5046  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5047  new_nproc));
5048 
5049  team->t.t_size_changed = 1;
5050 #if KMP_NESTED_HOT_TEAMS
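  // __kmp_hot_teams_mode == 0: extra threads are freed back to the thread
  // pool; otherwise (mode 1) the extra threads stay in the team as a reserve,
  // switched to wait on their own b_go flag, so the hot team can grow back
  // without re-allocating them.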
5051  if (__kmp_hot_teams_mode == 0) {
5052  // AC: saved number of threads should correspond to team's value in this
5053  // mode; it can be bigger in mode 1, when the hot team has threads in reserve
5054  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5055  hot_teams[level].hot_team_nth = new_nproc;
5056 #endif // KMP_NESTED_HOT_TEAMS
5057  /* release the extra threads we don't need any more */
5058  for (f = new_nproc; f < team->t.t_nproc; f++) {
5059  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5060  if (__kmp_tasking_mode != tskm_immediate_exec) {
5061  // When decreasing team size, threads no longer in the team should
5062  // unref task team.
5063  team->t.t_threads[f]->th.th_task_team = NULL;
5064  }
5065  __kmp_free_thread(team->t.t_threads[f]);
5066  team->t.t_threads[f] = NULL;
5067  }
5068 #if KMP_NESTED_HOT_TEAMS
5069  } // (__kmp_hot_teams_mode == 0)
5070  else {
5071  // When keeping extra threads in team, switch threads to wait on own
5072  // b_go flag
5073  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5074  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5075  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5076  for (int b = 0; b < bs_last_barrier; ++b) {
5077  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5078  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5079  }
5080  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5081  }
5082  }
5083  }
5084 #endif // KMP_NESTED_HOT_TEAMS
5085  team->t.t_nproc = new_nproc;
5086  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5087  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5088  __kmp_reinitialize_team(team, new_icvs,
5089  root->r.r_uber_thread->th.th_ident);
5090 
5091  // Update remaining threads
5092  for (f = 0; f < new_nproc; ++f) {
5093  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5094  }
5095 
5096  // restore the current task state of the primary thread: should be the
5097  // implicit task
5098  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5099  team->t.t_threads[0], team));
5100 
5101  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5102 
5103 #ifdef KMP_DEBUG
5104  for (f = 0; f < team->t.t_nproc; f++) {
5105  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5106  team->t.t_threads[f]->th.th_team_nproc ==
5107  team->t.t_nproc);
5108  }
5109 #endif
5110 
5111  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5112 #if KMP_AFFINITY_SUPPORTED
5113  __kmp_partition_places(team);
5114 #endif
5115  } else { // team->t.t_nproc < new_nproc
5116 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5117  kmp_affin_mask_t *old_mask;
5118  if (KMP_AFFINITY_CAPABLE()) {
5119  KMP_CPU_ALLOC(old_mask);
5120  }
5121 #endif
5122 
5123  KA_TRACE(20,
5124  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5125  new_nproc));
5126 
5127  team->t.t_size_changed = 1;
5128 
5129 #if KMP_NESTED_HOT_TEAMS
5130  int avail_threads = hot_teams[level].hot_team_nth;
5131  if (new_nproc < avail_threads)
5132  avail_threads = new_nproc;
5133  kmp_info_t **other_threads = team->t.t_threads;
5134  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5135  // Adjust barrier data of reserved threads (if any) of the team
5136  // Other data will be set in __kmp_initialize_info() below.
5137  int b;
5138  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5139  for (b = 0; b < bs_last_barrier; ++b) {
5140  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5141  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5142 #if USE_DEBUGGER
5143  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5144 #endif
5145  }
5146  }
5147  if (hot_teams[level].hot_team_nth >= new_nproc) {
5148  // we have all needed threads in reserve, no need to allocate any
5149  // this is only possible in mode 1; there cannot be reserved threads in mode 0
5150  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5151  team->t.t_nproc = new_nproc; // just get reserved threads involved
5152  } else {
5153  // we may have some threads in reserve, but not enough
5154  team->t.t_nproc =
5155  hot_teams[level]
5156  .hot_team_nth; // get reserved threads involved if any
5157  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5158 #endif // KMP_NESTED_HOT_TEAMS
5159  if (team->t.t_max_nproc < new_nproc) {
5160  /* reallocate larger arrays */
5161  __kmp_reallocate_team_arrays(team, new_nproc);
5162  __kmp_reinitialize_team(team, new_icvs, NULL);
5163  }
5164 
5165 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5166  /* Temporarily set full mask for primary thread before creation of
5167  workers. The reason is that workers inherit the affinity from the
5168  primary thread, so if a lot of workers are created on the single
5169  core quickly, they don't get a chance to set their own affinity for
5170  a long time. */
5171  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5172 #endif
5173 
5174  /* allocate new threads for the hot team */
5175  for (f = team->t.t_nproc; f < new_nproc; f++) {
5176  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5177  KMP_DEBUG_ASSERT(new_worker);
5178  team->t.t_threads[f] = new_worker;
5179 
5180  KA_TRACE(20,
5181  ("__kmp_allocate_team: team %d init T#%d arrived: "
5182  "join=%llu, plain=%llu\n",
5183  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5184  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5185  team->t.t_bar[bs_plain_barrier].b_arrived));
5186 
5187  { // Initialize barrier data for new threads.
5188  int b;
5189  kmp_balign_t *balign = new_worker->th.th_bar;
5190  for (b = 0; b < bs_last_barrier; ++b) {
5191  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5192  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5193  KMP_BARRIER_PARENT_FLAG);
5194 #if USE_DEBUGGER
5195  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5196 #endif
5197  }
5198  }
5199  }
5200 
5201 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5202  if (KMP_AFFINITY_CAPABLE()) {
5203  /* Restore initial primary thread's affinity mask */
5204  __kmp_set_system_affinity(old_mask, TRUE);
5205  KMP_CPU_FREE(old_mask);
5206  }
5207 #endif
5208 #if KMP_NESTED_HOT_TEAMS
5209  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5210 #endif // KMP_NESTED_HOT_TEAMS
5211  /* make sure everyone is synchronized */
5212  int old_nproc = team->t.t_nproc; // save old value and use to update only
5213  // new threads below
5214  __kmp_initialize_team(team, new_nproc, new_icvs,
5215  root->r.r_uber_thread->th.th_ident);
5216 
5217  /* reinitialize the threads */
5218  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5219  for (f = 0; f < team->t.t_nproc; ++f)
5220  __kmp_initialize_info(team->t.t_threads[f], team, f,
5221  __kmp_gtid_from_tid(f, team));
5222 
5223  if (level) { // set th_task_state for new threads in nested hot team
5224  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5225  // only need to set the th_task_state for the new threads. th_task_state
5226  // for primary thread will not be accurate until after this in
5227  // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5228  // get the correct value.
5229  for (f = old_nproc; f < team->t.t_nproc; ++f)
5230  team->t.t_threads[f]->th.th_task_state =
5231  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5232  } else { // set th_task_state for new threads in non-nested hot team
5233  // copy primary thread's state
5234  kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5235  for (f = old_nproc; f < team->t.t_nproc; ++f)
5236  team->t.t_threads[f]->th.th_task_state = old_state;
5237  }
5238 
5239 #ifdef KMP_DEBUG
5240  for (f = 0; f < team->t.t_nproc; ++f) {
5241  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5242  team->t.t_threads[f]->th.th_team_nproc ==
5243  team->t.t_nproc);
5244  }
5245 #endif
5246 
5247  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5248 #if KMP_AFFINITY_SUPPORTED
5249  __kmp_partition_places(team);
5250 #endif
5251  } // Check changes in number of threads
5252 
5253  kmp_info_t *master = team->t.t_threads[0];
5254  if (master->th.th_teams_microtask) {
5255  for (f = 1; f < new_nproc; ++f) {
5256  // propagate teams construct specific info to workers
5257  kmp_info_t *thr = team->t.t_threads[f];
5258  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5259  thr->th.th_teams_level = master->th.th_teams_level;
5260  thr->th.th_teams_size = master->th.th_teams_size;
5261  }
5262  }
5263 #if KMP_NESTED_HOT_TEAMS
5264  if (level) {
5265  // Sync barrier state for nested hot teams, not needed for outermost hot
5266  // team.
5267  for (f = 1; f < new_nproc; ++f) {
5268  kmp_info_t *thr = team->t.t_threads[f];
5269  int b;
5270  kmp_balign_t *balign = thr->th.th_bar;
5271  for (b = 0; b < bs_last_barrier; ++b) {
5272  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5273  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5274 #if USE_DEBUGGER
5275  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5276 #endif
5277  }
5278  }
5279  }
5280 #endif // KMP_NESTED_HOT_TEAMS
5281 
5282  /* reallocate space for arguments if necessary */
5283  __kmp_alloc_argv_entries(argc, team, TRUE);
5284  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5285  // The hot team re-uses the previous task team,
5286  // if untouched during the previous release->gather phase.
5287 
5288  KF_TRACE(10, (" hot_team = %p\n", team));
5289 
5290 #if KMP_DEBUG
5291  if (__kmp_tasking_mode != tskm_immediate_exec) {
5292  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5293  "task_team[1] = %p after reinit\n",
5294  team->t.t_task_team[0], team->t.t_task_team[1]));
5295  }
5296 #endif
5297 
5298 #if OMPT_SUPPORT
5299  __ompt_team_assign_id(team, ompt_parallel_data);
5300 #endif
5301 
5302  KMP_MB();
5303 
5304  return team;
5305  }
5306 
5307  /* next, let's try to take one from the team pool */
5308  KMP_MB();
5309  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5310  /* TODO: consider resizing undersized teams instead of reaping them, now
5311  that we have a resizing mechanism */
5312  if (team->t.t_max_nproc >= max_nproc) {
5313  /* take this team from the team pool */
5314  __kmp_team_pool = team->t.t_next_pool;
5315 
5316  /* setup the team for fresh use */
5317  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5318 
5319  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5320  "task_team[1] %p to NULL\n",
5321  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5322  team->t.t_task_team[0] = NULL;
5323  team->t.t_task_team[1] = NULL;
5324 
5325  /* reallocate space for arguments if necessary */
5326  __kmp_alloc_argv_entries(argc, team, TRUE);
5327  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5328 
5329  KA_TRACE(
5330  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5331  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5332  { // Initialize barrier data.
5333  int b;
5334  for (b = 0; b < bs_last_barrier; ++b) {
5335  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5336 #if USE_DEBUGGER
5337  team->t.t_bar[b].b_master_arrived = 0;
5338  team->t.t_bar[b].b_team_arrived = 0;
5339 #endif
5340  }
5341  }
5342 
5343  team->t.t_proc_bind = new_proc_bind;
5344 
5345  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5346  team->t.t_id));
5347 
5348 #if OMPT_SUPPORT
5349  __ompt_team_assign_id(team, ompt_parallel_data);
5350 #endif
5351 
5352  KMP_MB();
5353 
5354  return team;
5355  }
5356 
5357  /* reap team if it is too small, then loop back and check the next one */
5358  // not sure if this is wise, but it will be redone during the hot-teams
5359  // rewrite.
5360  /* TODO: Use technique to find the right size hot-team, don't reap them */
5361  team = __kmp_reap_team(team);
5362  __kmp_team_pool = team;
5363  }
5364 
5365  /* nothing available in the pool, no matter, make a new team! */
5366  KMP_MB();
5367  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5368 
5369  /* and set it up */
5370  team->t.t_max_nproc = max_nproc;
5371  /* NOTE well, for some reason allocating one big buffer and dividing it up
5372  seems to really hurt performance a lot on the P4, so let's not use this */
5373  __kmp_allocate_team_arrays(team, max_nproc);
5374 
5375  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5376  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5377 
5378  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5379  "%p to NULL\n",
5380  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5381  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5382  // memory, no need to duplicate
5383  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5384  // memory, no need to duplicate
5385 
5386  if (__kmp_storage_map) {
5387  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5388  }
5389 
5390  /* allocate space for arguments */
5391  __kmp_alloc_argv_entries(argc, team, FALSE);
5392  team->t.t_argc = argc;
5393 
5394  KA_TRACE(20,
5395  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5396  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5397  { // Initialize barrier data.
5398  int b;
5399  for (b = 0; b < bs_last_barrier; ++b) {
5400  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5401 #if USE_DEBUGGER
5402  team->t.t_bar[b].b_master_arrived = 0;
5403  team->t.t_bar[b].b_team_arrived = 0;
5404 #endif
5405  }
5406  }
5407 
5408  team->t.t_proc_bind = new_proc_bind;
5409 
5410 #if OMPT_SUPPORT
5411  __ompt_team_assign_id(team, ompt_parallel_data);
5412  team->t.ompt_serialized_team_info = NULL;
5413 #endif
5414 
5415  KMP_MB();
5416 
5417  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5418  team->t.t_id));
5419 
5420  return team;
5421 }
5422 
5423 /* TODO implement hot-teams at all levels */
5424 /* TODO implement lazy thread release on demand (disband request) */
5425 
5426 /* free the team. return it to the team pool. release all the threads
5427  * associated with it */
5428 void __kmp_free_team(kmp_root_t *root,
5429  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5430  int f;
5431  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5432  team->t.t_id));
5433 
5434  /* verify state */
5435  KMP_DEBUG_ASSERT(root);
5436  KMP_DEBUG_ASSERT(team);
5437  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5438  KMP_DEBUG_ASSERT(team->t.t_threads);
5439 
5440  int use_hot_team = team == root->r.r_hot_team;
5441 #if KMP_NESTED_HOT_TEAMS
5442  int level;
5443  kmp_hot_team_ptr_t *hot_teams;
5444  if (master) {
5445  level = team->t.t_active_level - 1;
5446  if (master->th.th_teams_microtask) { // in teams construct?
5447  if (master->th.th_teams_size.nteams > 1) {
5448  ++level; // level was not increased in teams construct for
5449  // team_of_masters
5450  }
5451  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5452  master->th.th_teams_level == team->t.t_level) {
5453  ++level; // level was not increased in teams construct for
5454  // team_of_workers before the parallel
5455  } // team->t.t_level will be increased inside parallel
5456  }
5457  hot_teams = master->th.th_hot_teams;
5458  if (level < __kmp_hot_teams_max_level) {
5459  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5460  use_hot_team = 1;
5461  }
5462  }
5463 #endif // KMP_NESTED_HOT_TEAMS
5464 
5465  /* team is done working */
5466  TCW_SYNC_PTR(team->t.t_pkfn,
5467  NULL); // Important for Debugging Support Library.
5468 #if KMP_OS_WINDOWS
5469  team->t.t_copyin_counter = 0; // init counter for possible reuse
5470 #endif
5471  // Do not reset pointer to parent team to NULL for hot teams.
5472 
5473  /* if we are non-hot team, release our threads */
5474  if (!use_hot_team) {
5475  if (__kmp_tasking_mode != tskm_immediate_exec) {
5476  // Wait for threads to reach reapable state
5477  for (f = 1; f < team->t.t_nproc; ++f) {
5478  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5479  kmp_info_t *th = team->t.t_threads[f];
5480  volatile kmp_uint32 *state = &th->th.th_reap_state;
5481  while (*state != KMP_SAFE_TO_REAP) {
5482 #if KMP_OS_WINDOWS
5483  // On Windows a thread can be killed at any time, check this
5484  DWORD ecode;
5485  if (!__kmp_is_thread_alive(th, &ecode)) {
5486  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5487  break;
5488  }
5489 #endif
5490  // first check if thread is sleeping
5491  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5492  if (fl.is_sleeping())
5493  fl.resume(__kmp_gtid_from_thread(th));
5494  KMP_CPU_PAUSE();
5495  }
5496  }
5497 
5498  // Delete task teams
5499  int tt_idx;
5500  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5501  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5502  if (task_team != NULL) {
5503  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5504  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5505  team->t.t_threads[f]->th.th_task_team = NULL;
5506  }
5507  KA_TRACE(
5508  20,
5509  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5510  __kmp_get_gtid(), task_team, team->t.t_id));
5511 #if KMP_NESTED_HOT_TEAMS
5512  __kmp_free_task_team(master, task_team);
5513 #endif
5514  team->t.t_task_team[tt_idx] = NULL;
5515  }
5516  }
5517  }
5518 
5519  // Reset pointer to parent team only for non-hot teams.
5520  team->t.t_parent = NULL;
5521  team->t.t_level = 0;
5522  team->t.t_active_level = 0;
5523 
5524  /* free the worker threads */
5525  for (f = 1; f < team->t.t_nproc; ++f) {
5526  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5527  __kmp_free_thread(team->t.t_threads[f]);
5528  team->t.t_threads[f] = NULL;
5529  }
5530 
5531  /* put the team back in the team pool */
5532  /* TODO limit size of team pool, call reap_team if pool too large */
5533  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5534  __kmp_team_pool = (volatile kmp_team_t *)team;
5535  } else { // Check if team was created for primary threads in teams construct
5536  // See if first worker is a CG root
5537  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5538  team->t.t_threads[1]->th.th_cg_roots);
5539  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5540  // Clean up the CG root nodes on workers so that this team can be re-used
5541  for (f = 1; f < team->t.t_nproc; ++f) {
5542  kmp_info_t *thr = team->t.t_threads[f];
5543  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5544  thr->th.th_cg_roots->cg_root == thr);
5545  // Pop current CG root off list
5546  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5547  thr->th.th_cg_roots = tmp->up;
5548  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5549  " up to node %p. cg_nthreads was %d\n",
5550  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5551  int i = tmp->cg_nthreads--;
5552  if (i == 1) {
5553  __kmp_free(tmp); // free CG if we are the last thread in it
5554  }
5555  // Restore current task's thread_limit from CG root
5556  if (thr->th.th_cg_roots)
5557  thr->th.th_current_task->td_icvs.thread_limit =
5558  thr->th.th_cg_roots->cg_thread_limit;
5559  }
5560  }
5561  }
5562 
5563  KMP_MB();
5564 }
5565 
5566 /* reap the team. destroy it, reclaim all its resources and free its memory */
5567 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5568  kmp_team_t *next_pool = team->t.t_next_pool;
5569 
5570  KMP_DEBUG_ASSERT(team);
5571  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5572  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5573  KMP_DEBUG_ASSERT(team->t.t_threads);
5574  KMP_DEBUG_ASSERT(team->t.t_argv);
5575 
5576  /* TODO clean the threads that are a part of this? */
5577 
5578  /* free stuff */
5579  __kmp_free_team_arrays(team);
5580  if (team->t.t_argv != &team->t.t_inline_argv[0])
5581  __kmp_free((void *)team->t.t_argv);
5582  __kmp_free(team);
5583 
5584  KMP_MB();
5585  return next_pool;
5586 }
5587 
5588 // Free the thread. Don't reap it, just place it on the pool of available
5589 // threads.
5590 //
5591 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5592 // binding for the affinity mechanism to be useful.
5593 //
5594 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5595 // However, we want to avoid a potential performance problem by always
5596 // scanning through the list to find the correct point at which to insert
5597 // the thread (potential N**2 behavior). To do this we keep track of the
5598 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5599 // With single-level parallelism, threads will always be added to the tail
5600 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5601 // parallelism, all bets are off and we may need to scan through the entire
5602 // free list.
5603 //
5604 // This change also has a potentially large performance benefit, for some
5605 // applications. Previously, as threads were freed from the hot team, they
5606 // would be placed back on the free list in inverse order. If the hot team
5607  grew back to its original size, then the freed threads would be placed
5608 // back on the hot team in reverse order. This could cause bad cache
5609 // locality problems on programs where the size of the hot team regularly
5610 // grew and shrunk.
5611 //
5612 // Now, for single-level parallelism, the OMP tid is always == gtid.
5613 void __kmp_free_thread(kmp_info_t *this_th) {
5614  int gtid;
5615  kmp_info_t **scan;
5616 
5617  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5618  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5619 
5620  KMP_DEBUG_ASSERT(this_th);
5621 
5622  // When moving a thread to the pool, switch it to wait on its own b_go flag
5623  // and reset its barrier team pointer to NULL (uninitialized).
5624  int b;
5625  kmp_balign_t *balign = this_th->th.th_bar;
5626  for (b = 0; b < bs_last_barrier; ++b) {
5627  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5628  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5629  balign[b].bb.team = NULL;
5630  balign[b].bb.leaf_kids = 0;
5631  }
5632  this_th->th.th_task_state = 0;
5633  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5634 
5635  /* put thread back on the free pool */
5636  TCW_PTR(this_th->th.th_team, NULL);
5637  TCW_PTR(this_th->th.th_root, NULL);
5638  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5639 
5640  while (this_th->th.th_cg_roots) {
5641  this_th->th.th_cg_roots->cg_nthreads--;
5642  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5643  " %p of thread %p to %d\n",
5644  this_th, this_th->th.th_cg_roots,
5645  this_th->th.th_cg_roots->cg_root,
5646  this_th->th.th_cg_roots->cg_nthreads));
5647  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5648  if (tmp->cg_root == this_th) { // Thread is a cg_root
5649  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5650  KA_TRACE(
5651  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5652  this_th->th.th_cg_roots = tmp->up;
5653  __kmp_free(tmp);
5654  } else { // Worker thread
5655  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5656  __kmp_free(tmp);
5657  }
5658  this_th->th.th_cg_roots = NULL;
5659  break;
5660  }
5661  }
5662 
5663  /* If the implicit task assigned to this thread can be used by other threads
5664  * -> multiple threads can share the data and try to free the task at
5665  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5666  * with higher probability when the hot team is disabled but can occur even when
5667  * the hot team is enabled */
5668  __kmp_free_implicit_task(this_th);
5669  this_th->th.th_current_task = NULL;
5670 
5671  // If the __kmp_thread_pool_insert_pt is already past the new insert
5672  // point, then we need to re-scan the entire list.
5673  gtid = this_th->th.th_info.ds.ds_gtid;
5674  if (__kmp_thread_pool_insert_pt != NULL) {
5675  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5676  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5677  __kmp_thread_pool_insert_pt = NULL;
5678  }
5679  }
5680 
5681  // Scan down the list to find the place to insert the thread.
5682  // scan is the address of a link in the list, possibly the address of
5683  // __kmp_thread_pool itself.
5684  //
5685  // In the absence of nested parallelism, the for loop will have 0 iterations.
5686  if (__kmp_thread_pool_insert_pt != NULL) {
5687  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5688  } else {
5689  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5690  }
5691  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5692  scan = &((*scan)->th.th_next_pool))
5693  ;
5694 
5695  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5696  // to its address.
5697  TCW_PTR(this_th->th.th_next_pool, *scan);
5698  __kmp_thread_pool_insert_pt = *scan = this_th;
5699  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5700  (this_th->th.th_info.ds.ds_gtid <
5701  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5702  TCW_4(this_th->th.th_in_pool, TRUE);
5703  __kmp_suspend_initialize_thread(this_th);
5704  __kmp_lock_suspend_mx(this_th);
5705  if (this_th->th.th_active == TRUE) {
5706  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5707  this_th->th.th_active_in_pool = TRUE;
5708  }
5709 #if KMP_DEBUG
5710  else {
5711  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5712  }
5713 #endif
5714  __kmp_unlock_suspend_mx(this_th);
5715 
5716  TCW_4(__kmp_nth, __kmp_nth - 1);
5717 
5718 #ifdef KMP_ADJUST_BLOCKTIME
5719  /* Adjust blocktime back to user setting or default if necessary */
5720  /* Middle initialization might never have occurred */
5721  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5722  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5723  if (__kmp_nth <= __kmp_avail_proc) {
5724  __kmp_zero_bt = FALSE;
5725  }
5726  }
5727 #endif /* KMP_ADJUST_BLOCKTIME */
5728 
5729  KMP_MB();
5730 }
5731 
5732 /* ------------------------------------------------------------------------ */
5733 
5734 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5735 #if OMP_PROFILING_SUPPORT
5736  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5737  // TODO: add a configuration option for time granularity
5738  if (ProfileTraceFile)
5739  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5740 #endif
5741 
5742  int gtid = this_thr->th.th_info.ds.ds_gtid;
5743  /* void *stack_data;*/
5744  kmp_team_t **volatile pteam;
5745 
5746  KMP_MB();
5747  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5748 
5749  if (__kmp_env_consistency_check) {
5750  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5751  }
5752 
5753 #if OMPT_SUPPORT
5754  ompt_data_t *thread_data = nullptr;
5755  if (ompt_enabled.enabled) {
5756  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5757  *thread_data = ompt_data_none;
5758 
5759  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5760  this_thr->th.ompt_thread_info.wait_id = 0;
5761  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5762  this_thr->th.ompt_thread_info.parallel_flags = 0;
5763  if (ompt_enabled.ompt_callback_thread_begin) {
5764  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5765  ompt_thread_worker, thread_data);
5766  }
5767  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5768  }
5769 #endif
5770 
5771  /* This is the place where threads wait for work */
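  /* Worker loop (roughly): sleep or spin in the fork barrier until a team
     hands us work, run the team's microtask via t_invoke, synchronize in the
     join barrier, and repeat until shutdown sets __kmp_global.g.g_done. */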
5772  while (!TCR_4(__kmp_global.g.g_done)) {
5773  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5774  KMP_MB();
5775 
5776  /* wait for work to do */
5777  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5778 
5779  /* No tid yet since not part of a team */
5780  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5781 
5782 #if OMPT_SUPPORT
5783  if (ompt_enabled.enabled) {
5784  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5785  }
5786 #endif
5787 
5788  pteam = &this_thr->th.th_team;
5789 
5790  /* have we been allocated? */
5791  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5792  /* we were just woken up, so run our new task */
5793  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5794  int rc;
5795  KA_TRACE(20,
5796  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5797  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5798  (*pteam)->t.t_pkfn));
5799 
5800  updateHWFPControl(*pteam);
5801 
5802 #if OMPT_SUPPORT
5803  if (ompt_enabled.enabled) {
5804  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5805  }
5806 #endif
5807 
5808  rc = (*pteam)->t.t_invoke(gtid);
5809  KMP_ASSERT(rc);
5810 
5811  KMP_MB();
5812  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5813  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5814  (*pteam)->t.t_pkfn));
5815  }
5816 #if OMPT_SUPPORT
5817  if (ompt_enabled.enabled) {
5818  /* no frame set while outside task */
5819  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5820 
5821  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5822  }
5823 #endif
5824  /* join barrier after parallel region */
5825  __kmp_join_barrier(gtid);
5826  }
5827  }
5828  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5829 
5830 #if OMPT_SUPPORT
5831  if (ompt_enabled.ompt_callback_thread_end) {
5832  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5833  }
5834 #endif
5835 
5836  this_thr->th.th_task_team = NULL;
5837  /* run the destructors for the threadprivate data for this thread */
5838  __kmp_common_destroy_gtid(gtid);
5839 
5840  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5841  KMP_MB();
5842 
5843 #if OMP_PROFILING_SUPPORT
5844  llvm::timeTraceProfilerFinishThread();
5845 #endif
5846  return this_thr;
5847 }
5848 
5849 /* ------------------------------------------------------------------------ */
5850 
5851 void __kmp_internal_end_dest(void *specific_gtid) {
5852  // Make sure no significant bits are lost
5853  int gtid;
5854  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
5855 
5856  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5857  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
5858  * this is because 0 is reserved for the nothing-stored case */
5859 
5860  __kmp_internal_end_thread(gtid);
5861 }
5862 
5863 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5864 
5865 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5866  __kmp_internal_end_atexit();
5867 }
5868 
5869 #endif
5870 
5871 /* [Windows] josh: when the atexit handler is called, there may still be more
5872  than one thread alive */
5873 void __kmp_internal_end_atexit(void) {
5874  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5875  /* [Windows]
5876  josh: ideally, we want to completely shutdown the library in this atexit
5877  handler, but stat code that depends on thread specific data for gtid fails
5878  because that data becomes unavailable at some point during the shutdown, so
5879  we call __kmp_internal_end_thread instead. We should eventually remove the
5880  dependency on __kmp_get_specific_gtid in the stat code and use
5881  __kmp_internal_end_library to cleanly shutdown the library.
5882 
5883  // TODO: Can some of this comment about GVS be removed?
5884  I suspect that the offending stat code is executed when the calling thread
5885  tries to clean up a dead root thread's data structures, resulting in GVS
5886  code trying to close the GVS structures for that thread, but since the stat
5887  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5888  the calling thread is cleaning up itself instead of another thread, it gets
5889  confused. This happens because allowing a thread to unregister and cleanup
5890  another thread is a recent modification for addressing an issue.
5891  Based on the current design (20050722), a thread may end up
5892  trying to unregister another thread only if thread death does not trigger
5893  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5894  thread specific data destructor function to detect thread death. For
5895  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5896  is nothing. Thus, the workaround is applicable only for Windows static
5897  stat library. */
5898  __kmp_internal_end_library(-1);
5899 #if KMP_OS_WINDOWS
5900  __kmp_close_console();
5901 #endif
5902 }
5903 
5904 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5905  // It is assumed __kmp_forkjoin_lock is acquired.
5906 
5907  int gtid;
5908 
5909  KMP_DEBUG_ASSERT(thread != NULL);
5910 
5911  gtid = thread->th.th_info.ds.ds_gtid;
5912 
5913  if (!is_root) {
5914  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5915  /* Assume the threads are at the fork barrier here */
5916  KA_TRACE(
5917  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5918  gtid));
5919  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5920  * (GEH) */
5921  ANNOTATE_HAPPENS_BEFORE(thread);
5922  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5923  thread);
5924  __kmp_release_64(&flag);
5925  }
5926 
5927  // Terminate OS thread.
5928  __kmp_reap_worker(thread);
5929 
5930  // The thread was killed asynchronously. If it was actively
5931  // spinning in the thread pool, decrement the global count.
5932  //
5933  // There is a small timing hole here - if the worker thread was just waking
5934  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5935  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5936  // the global counter might not get updated.
5937  //
5938  // Currently, this can only happen as the library is unloaded,
5939  // so there are no harmful side effects.
5940  if (thread->th.th_active_in_pool) {
5941  thread->th.th_active_in_pool = FALSE;
5942  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5943  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5944  }
5945  }
5946 
5947  __kmp_free_implicit_task(thread);
5948 
5949 // Free the fast memory for tasking
5950 #if USE_FAST_MEMORY
5951  __kmp_free_fast_memory(thread);
5952 #endif /* USE_FAST_MEMORY */
5953 
5954  __kmp_suspend_uninitialize_thread(thread);
5955 
5956  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5957  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5958 
5959  --__kmp_all_nth;
5960  // __kmp_nth was decremented when thread is added to the pool.
5961 
5962 #ifdef KMP_ADJUST_BLOCKTIME
5963  /* Adjust blocktime back to user setting or default if necessary */
5964  /* Middle initialization might never have occurred */
5965  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5966  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5967  if (__kmp_nth <= __kmp_avail_proc) {
5968  __kmp_zero_bt = FALSE;
5969  }
5970  }
5971 #endif /* KMP_ADJUST_BLOCKTIME */
5972 
5973  /* free the memory being used */
5974  if (__kmp_env_consistency_check) {
5975  if (thread->th.th_cons) {
5976  __kmp_free_cons_stack(thread->th.th_cons);
5977  thread->th.th_cons = NULL;
5978  }
5979  }
5980 
5981  if (thread->th.th_pri_common != NULL) {
5982  __kmp_free(thread->th.th_pri_common);
5983  thread->th.th_pri_common = NULL;
5984  }
5985 
5986  if (thread->th.th_task_state_memo_stack != NULL) {
5987  __kmp_free(thread->th.th_task_state_memo_stack);
5988  thread->th.th_task_state_memo_stack = NULL;
5989  }
5990 
5991 #if KMP_USE_BGET
5992  if (thread->th.th_local.bget_data != NULL) {
5993  __kmp_finalize_bget(thread);
5994  }
5995 #endif
5996 
5997 #if KMP_AFFINITY_SUPPORTED
5998  if (thread->th.th_affin_mask != NULL) {
5999  KMP_CPU_FREE(thread->th.th_affin_mask);
6000  thread->th.th_affin_mask = NULL;
6001  }
6002 #endif /* KMP_AFFINITY_SUPPORTED */
6003 
6004 #if KMP_USE_HIER_SCHED
6005  if (thread->th.th_hier_bar_data != NULL) {
6006  __kmp_free(thread->th.th_hier_bar_data);
6007  thread->th.th_hier_bar_data = NULL;
6008  }
6009 #endif
6010 
6011  __kmp_reap_team(thread->th.th_serial_team);
6012  thread->th.th_serial_team = NULL;
6013  __kmp_free(thread);
6014 
6015  KMP_MB();
6016 
6017 } // __kmp_reap_thread
6018 
6019 static void __kmp_internal_end(void) {
6020  int i;
6021 
6022  /* First, unregister the library */
6023  __kmp_unregister_library();
6024 
6025 #if KMP_OS_WINDOWS
6026  /* In Win static library, we can't tell when a root actually dies, so we
6027  reclaim the data structures for any root threads that have died but not
6028  unregistered themselves, in order to shut down cleanly.
6029  In Win dynamic library we also can't tell when a thread dies. */
6030  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6031 // dead roots
6032 #endif
6033 
6034  for (i = 0; i < __kmp_threads_capacity; i++)
6035  if (__kmp_root[i])
6036  if (__kmp_root[i]->r.r_active)
6037  break;
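  // If the scan above stopped early (i < __kmp_threads_capacity), some root is
  // still active, so only the monitor thread is reaped; otherwise all pooled
  // worker threads, teams and task teams are reaped below.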
6038  KMP_MB(); /* Flush all pending memory write invalidates. */
6039  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6040 
6041  if (i < __kmp_threads_capacity) {
6042 #if KMP_USE_MONITOR
6043  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6044  KMP_MB(); /* Flush all pending memory write invalidates. */
6045 
6046  // Need to check that monitor was initialized before reaping it. If we are
6047  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6048  // __kmp_monitor will appear to contain valid data, but it is only valid in
6049  // the parent process, not the child.
6050  // New behavior (201008): instead of keying off of the flag
6051  // __kmp_init_parallel, the monitor thread creation is keyed off
6052  // of the new flag __kmp_init_monitor.
6053  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6054  if (TCR_4(__kmp_init_monitor)) {
6055  __kmp_reap_monitor(&__kmp_monitor);
6056  TCW_4(__kmp_init_monitor, 0);
6057  }
6058  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6059  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6060 #endif // KMP_USE_MONITOR
6061  } else {
6062 /* TODO move this to cleanup code */
6063 #ifdef KMP_DEBUG
6064  /* make sure that everything has properly ended */
6065  for (i = 0; i < __kmp_threads_capacity; i++) {
6066  if (__kmp_root[i]) {
6067  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6068  // there can be uber threads alive here
6069  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6070  }
6071  }
6072 #endif
6073 
6074  KMP_MB();
6075 
6076  // Reap the worker threads.
6077  // This is valid for now, but be careful if threads are reaped sooner.
6078  while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6079  // Get the next thread from the pool.
6080  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6081  __kmp_thread_pool = thread->th.th_next_pool;
6082  // Reap it.
6083  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6084  thread->th.th_next_pool = NULL;
6085  thread->th.th_in_pool = FALSE;
6086  __kmp_reap_thread(thread, 0);
6087  }
6088  __kmp_thread_pool_insert_pt = NULL;
6089 
6090  // Reap teams.
6091  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6092  // Get the next team from the pool.
6093  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6094  __kmp_team_pool = team->t.t_next_pool;
6095  // Reap it.
6096  team->t.t_next_pool = NULL;
6097  __kmp_reap_team(team);
6098  }
6099 
6100  __kmp_reap_task_teams();
6101 
6102 #if KMP_OS_UNIX
6103  // Threads that are not reaped should not access any resources since they
6104  // are going to be deallocated soon, so the shutdown sequence should wait
6105  // until all threads either exit the final spin-waiting loop or begin
6106  // sleeping after the given blocktime.
6107  for (i = 0; i < __kmp_threads_capacity; i++) {
6108  kmp_info_t *thr = __kmp_threads[i];
6109  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6110  KMP_CPU_PAUSE();
6111  }
6112 #endif
6113 
6114  for (i = 0; i < __kmp_threads_capacity; ++i) {
6115  // TBD: Add some checking...
6116  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6117  }
6118 
6119  /* Make sure all threadprivate destructors get run by joining with all
6120  worker threads before resetting this flag */
6121  TCW_SYNC_4(__kmp_init_common, FALSE);
6122 
6123  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6124  KMP_MB();
6125 
6126 #if KMP_USE_MONITOR
6127  // See note above: One of the possible fixes for CQ138434 / CQ140126
6128  //
6129  // FIXME: push both code fragments down and CSE them?
6130  // push them into __kmp_cleanup() ?
6131  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6132  if (TCR_4(__kmp_init_monitor)) {
6133  __kmp_reap_monitor(&__kmp_monitor);
6134  TCW_4(__kmp_init_monitor, 0);
6135  }
6136  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6137  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6138 #endif
6139  } /* else !__kmp_global.t_active */
6140  TCW_4(__kmp_init_gtid, FALSE);
6141  KMP_MB(); /* Flush all pending memory write invalidates. */
6142 
6143  __kmp_cleanup();
6144 #if OMPT_SUPPORT
6145  ompt_fini();
6146 #endif
6147 }
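// Summary of the shutdown sequence above: unregister the library, mark the
// runtime done, and then (when no root is still active) reap the worker thread
// pool, the team pool and the task teams, wait on Unix for threads still in
// their final spin loop, reset __kmp_init_common only after the workers have
// been joined so that threadprivate destructors have run, and finally call
// __kmp_cleanup() and, if enabled, ompt_fini().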
6148 
6149 void __kmp_internal_end_library(int gtid_req) {
6150  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6151  /* this shouldn't be a race condition because __kmp_internal_end() is the
6152  only place to clear __kmp_serial_init */
6153  /* we'll check this later too, after we get the lock */
6154  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6155  // redundant, because the next check will work in any case.
6156  if (__kmp_global.g.g_abort) {
6157  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6158  /* TODO abort? */
6159  return;
6160  }
6161  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6162  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6163  return;
6164  }
6165 
6166  KMP_MB(); /* Flush all pending memory write invalidates. */
6167  /* find out who we are and what we should do */
6168  {
6169  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6170  KA_TRACE(
6171  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6172  if (gtid == KMP_GTID_SHUTDOWN) {
6173  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6174  "already shutdown\n"));
6175  return;
6176  } else if (gtid == KMP_GTID_MONITOR) {
6177  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6178  "registered, or system shutdown\n"));
6179  return;
6180  } else if (gtid == KMP_GTID_DNE) {
6181  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6182  "shutdown\n"));
6183  /* we don't know who we are, but we may still shut down the library */
6184  } else if (KMP_UBER_GTID(gtid)) {
6185  /* unregister ourselves as an uber thread. gtid is no longer valid */
6186  if (__kmp_root[gtid]->r.r_active) {
6187  __kmp_global.g.g_abort = -1;
6188  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6189  __kmp_unregister_library();
6190  KA_TRACE(10,
6191  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6192  gtid));
6193  return;
6194  } else {
6195  KA_TRACE(
6196  10,
6197  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6198  __kmp_unregister_root_current_thread(gtid);
6199  }
6200  } else {
6201 /* worker threads may call this function through the atexit handler, if they
6202  * call exit() */
6203 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6204  TODO: do a thorough shutdown instead */
6205 #ifdef DUMP_DEBUG_ON_EXIT
6206  if (__kmp_debug_buf)
6207  __kmp_dump_debug_buffer();
6208 #endif
6209  // Unregister the library here as well now that shared-memory registration
6210  // is used on Linux; otherwise stale files would be left behind in /dev/shm.
6211  // Clean up the shared memory file before exiting.
6212  __kmp_unregister_library();
6213  return;
6214  }
6215  }
6216  /* synchronize the termination process */
6217  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6218 
6219  /* have we already finished */
6220  if (__kmp_global.g.g_abort) {
6221  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6222  /* TODO abort? */
6223  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6224  return;
6225  }
6226  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6227  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6228  return;
6229  }
6230 
6231  /* We need this lock to enforce mutual exclusion between this reading of
6232  __kmp_threads_capacity and the writing by __kmp_register_root.
6233  Alternatively, we can use a counter of roots that is atomically updated by
6234  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6235  __kmp_internal_end_*. */
6236  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6237 
6238  /* now we can safely conduct the actual termination */
6239  __kmp_internal_end();
6240 
6241  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6242  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6243 
6244  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6245 
6246 #ifdef DUMP_DEBUG_ON_EXIT
6247  if (__kmp_debug_buf)
6248  __kmp_dump_debug_buffer();
6249 #endif
6250 
6251 #if KMP_OS_WINDOWS
6252  __kmp_close_console();
6253 #endif
6254 
6255  __kmp_fini_allocator();
6256 
6257 } // __kmp_internal_end_library
6258 
6259 void __kmp_internal_end_thread(int gtid_req) {
6260  int i;
6261 
6262  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6263  /* this shouldn't be a race condition because __kmp_internal_end() is the
6264  * only place to clear __kmp_serial_init */
6265  /* we'll check this later too, after we get the lock */
6266  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6267  // redundant, because the next check will work in any case.
6268  if (__kmp_global.g.g_abort) {
6269  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6270  /* TODO abort? */
6271  return;
6272  }
6273  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6274  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6275  return;
6276  }
6277 
6278  // If hidden helper team has been initialized, we need to deinit it
6279  if (TCR_4(__kmp_init_hidden_helper)) {
6280  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6281  // First release the main thread to let it continue its work
6282  __kmp_hidden_helper_main_thread_release();
6283  // Wait until the hidden helper team has been destroyed
6284  __kmp_hidden_helper_threads_deinitz_wait();
6285  }
6286 
6287  KMP_MB(); /* Flush all pending memory write invalidates. */
6288 
6289  /* find out who we are and what we should do */
6290  {
6291  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6292  KA_TRACE(10,
6293  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6294  if (gtid == KMP_GTID_SHUTDOWN) {
6295  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6296  "already shutdown\n"));
6297  return;
6298  } else if (gtid == KMP_GTID_MONITOR) {
6299  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6300  "registered, or system shutdown\n"));
6301  return;
6302  } else if (gtid == KMP_GTID_DNE) {
6303  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6304  "shutdown\n"));
6305  return;
6306  /* we don't know who we are */
6307  } else if (KMP_UBER_GTID(gtid)) {
6308  /* unregister ourselves as an uber thread. gtid is no longer valid */
6309  if (__kmp_root[gtid]->r.r_active) {
6310  __kmp_global.g.g_abort = -1;
6311  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6312  KA_TRACE(10,
6313  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6314  gtid));
6315  return;
6316  } else {
6317  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6318  gtid));
6319  __kmp_unregister_root_current_thread(gtid);
6320  }
6321  } else {
6322  /* just a worker thread, let's leave */
6323  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6324 
6325  if (gtid >= 0) {
6326  __kmp_threads[gtid]->th.th_task_team = NULL;
6327  }
6328 
6329  KA_TRACE(10,
6330  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6331  gtid));
6332  return;
6333  }
6334  }
6335 #if KMP_DYNAMIC_LIB
6336  if (__kmp_pause_status != kmp_hard_paused)
6337  // AC: let's not shut down the dynamic library at the exit of an uber thread,
6338  // because it is better to shut down later in the library destructor.
6339  {
6340  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6341  return;
6342  }
6343 #endif
6344  /* synchronize the termination process */
6345  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6346 
6347  /* have we already finished */
6348  if (__kmp_global.g.g_abort) {
6349  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6350  /* TODO abort? */
6351  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6352  return;
6353  }
6354  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6355  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6356  return;
6357  }
6358 
6359  /* We need this lock to enforce mutual exclusion between this reading of
6360  __kmp_threads_capacity and the writing by __kmp_register_root.
6361  Alternatively, we can use a counter of roots that is atomically updated by
6362  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6363  __kmp_internal_end_*. */
6364 
6365  /* should we finish the run-time? are all siblings done? */
6366  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6367 
6368  for (i = 0; i < __kmp_threads_capacity; ++i) {
6369  if (KMP_UBER_GTID(i)) {
6370  KA_TRACE(
6371  10,
6372  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6373  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6374  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6375  return;
6376  }
6377  }
6378 
6379  /* now we can safely conduct the actual termination */
6380 
6381  __kmp_internal_end();
6382 
6383  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6384  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6385 
6386  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6387 
6388 #ifdef DUMP_DEBUG_ON_EXIT
6389  if (__kmp_debug_buf)
6390  __kmp_dump_debug_buffer();
6391 #endif
6392 } // __kmp_internal_end_thread
6393 
6394 // -----------------------------------------------------------------------------
6395 // Library registration stuff.
6396 
6397 static long __kmp_registration_flag = 0;
6398 // Random value used to indicate library initialization.
6399 static char *__kmp_registration_str = NULL;
6400 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6401 
6402 static inline char *__kmp_reg_status_name() {
6403 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6404  each thread. If registration and unregistration go in different threads
6405  (omp_misc_other_root_exit.cpp test case), the name of the registered_lib_env
6406  env var cannot be found, because the name will contain a different pid. */
6407 // macOS* complains about name being too long with additional getuid()
6408 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6409  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6410  (int)getuid());
6411 #else
6412  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6413 #endif
6414 } // __kmp_reg_status_name
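// Illustration only (pid/uid values are hypothetical): a process with pid
// 12345 and uid 1000 registers under "__KMP_REGISTERED_LIB_12345_1000" on
// Unix (non-Darwin) dynamic-library builds, and under
// "__KMP_REGISTERED_LIB_12345" in all other configurations.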
6415 
6416 void __kmp_register_library_startup(void) {
6417 
6418  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6419  int done = 0;
6420  union {
6421  double dtime;
6422  long ltime;
6423  } time;
6424 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6425  __kmp_initialize_system_tick();
6426 #endif
6427  __kmp_read_system_time(&time.dtime);
6428  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6429  __kmp_registration_str =
6430  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6431  __kmp_registration_flag, KMP_LIBRARY_FILE);
6432 
6433  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6434  __kmp_registration_str));
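  // Illustration only (the address and file name below are hypothetical): if
  // the low 16 bits of the system time are 0x1A2B, the flag above becomes
  // 0xCAFE1A2B and the registration string has the form
  //   "<&__kmp_registration_flag>-<flag value>-<library file>",
  // e.g. "0x7f12a3b45678-cafe1a2b-libomp.so". The duplicate-detection code
  // below splits this string back apart on '-'.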
6435 
6436  while (!done) {
6437 
6438  char *value = NULL; // Actual value of the environment variable.
6439 
6440 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6441  char *shm_name = __kmp_str_format("/%s", name);
6442  int shm_preexist = 0;
6443  char *data1;
6444  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6445  if ((fd1 == -1) && (errno == EEXIST)) {
6446  // file didn't open because it already exists.
6447  // try opening existing file
6448  fd1 = shm_open(shm_name, O_RDWR, 0666);
6449  if (fd1 == -1) { // file didn't open
6450  // error out here
6451  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6452  __kmp_msg_null);
6453  } else {
6454  // able to open existing file
6455  shm_preexist = 1;
6456  }
6457  } else if (fd1 == -1) { // SHM didn't open due to an error other than
6458  // "already exists".
6459  // error out here.
6460  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6461  __kmp_msg_null);
6462  }
6463  if (shm_preexist == 0) {
6464  // we created the SHM; now set its size
6465  if (ftruncate(fd1, SHM_SIZE) == -1) {
6466  // error occurred while setting the size
6467  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6468  KMP_ERR(errno), __kmp_msg_null);
6469  }
6470  }
6471  data1 =
6472  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6473  if (data1 == MAP_FAILED) {
6474  // failed to map shared memory
6475  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6476  __kmp_msg_null);
6477  }
6478  if (shm_preexist == 0) { // set data to SHM, set value
6479  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6480  }
6481  // Read value from either what we just wrote or existing file.
6482  value = __kmp_str_format("%s", data1); // read value from SHM
6483  munmap(data1, SHM_SIZE);
6484  close(fd1);
6485 #else // Windows and unix with static library
6486  // Set the environment variable, but do not overwrite it if it already exists.
6487  __kmp_env_set(name, __kmp_registration_str, 0);
6488  // read value to see if it got set
6489  value = __kmp_env_get(name);
6490 #endif
6491 
6492  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6493  done = 1; // Ok, environment variable set successfully, exit the loop.
6494  } else {
6495  // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6496  // Check whether it is alive or dead.
6497  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6498  char *tail = value;
6499  char *flag_addr_str = NULL;
6500  char *flag_val_str = NULL;
6501  char const *file_name = NULL;
6502  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6503  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6504  file_name = tail;
6505  if (tail != NULL) {
6506  long *flag_addr = 0;
6507  unsigned long flag_val = 0;
6508  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6509  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6510  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6511  // First, check whether environment-encoded address is mapped into
6512  // addr space.
6513  // If so, dereference it to see if it still has the right value.
6514  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6515  neighbor = 1;
6516  } else {
6517  // If not, then we know the other copy of the library is no longer
6518  // running.
6519  neighbor = 2;
6520  }
6521  }
6522  }
6523  switch (neighbor) {
6524  case 0: // Cannot parse environment variable -- neighbor status unknown.
6525  // Assume it is the incompatible format of a future version of the
6526  // library, and assume the other library is alive.
6527  // WARN( ... ); // TODO: Issue a warning.
6528  file_name = "unknown library";
6529  KMP_FALLTHROUGH();
6530  // Attention! Falling through to the next case. That's intentional.
6531  case 1: { // Neighbor is alive.
6532  // Check it is allowed.
6533  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6534  if (!__kmp_str_match_true(duplicate_ok)) {
6535  // That's not allowed. Issue fatal error.
6536  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6537  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6538  }
6539  KMP_INTERNAL_FREE(duplicate_ok);
6540  __kmp_duplicate_library_ok = 1;
6541  done = 1; // Exit the loop.
6542  } break;
6543  case 2: { // Neighbor is dead.
6544 
6545 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6546  // close shared memory.
6547  shm_unlink(shm_name); // this removes the file in /dev/shm
6548 #else
6549  // Clear the variable and try to register library again.
6550  __kmp_env_unset(name);
6551 #endif
6552  } break;
6553  default: {
6554  KMP_DEBUG_ASSERT(0);
6555  } break;
6556  }
6557  }
6558  KMP_INTERNAL_FREE((void *)value);
6559 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6560  KMP_INTERNAL_FREE((void *)shm_name);
6561 #endif
6562  } // while
6563  KMP_INTERNAL_FREE((void *)name);
6564 
6565 } // func __kmp_register_library_startup
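// Usage note (illustrative sketch, not runtime code): a process that
// intentionally maps two copies of the runtime can suppress the fatal
// DuplicateLibrary error raised above by setting the environment variable
// before launch, e.g. in a shell:
//   KMP_DUPLICATE_LIB_OK=TRUE ./app
// The exact set of accepted "true" spellings is defined by
// __kmp_str_match_true() (see kmp_str.cpp); "TRUE" is used here for
// illustration.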
6566 
6567 void __kmp_unregister_library(void) {
6568 
6569  char *name = __kmp_reg_status_name();
6570  char *value = NULL;
6571 
6572 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6573  char *shm_name = __kmp_str_format("/%s", name);
6574  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6575  if (fd1 == -1) {
6576  // file did not open. return.
6577  return;
6578  }
6579  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6580  if (data1 != MAP_FAILED) {
6581  value = __kmp_str_format("%s", data1); // read value from SHM
6582  munmap(data1, SHM_SIZE);
6583  }
6584  close(fd1);
6585 #else
6586  value = __kmp_env_get(name);
6587 #endif
6588 
6589  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6590  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6591  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6592 // Ok, this is our variable. Delete it.
6593 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6594  shm_unlink(shm_name); // this removes the file in /dev/shm
6595 #else
6596  __kmp_env_unset(name);
6597 #endif
6598  }
6599 
6600 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6601  KMP_INTERNAL_FREE(shm_name);
6602 #endif
6603 
6604  KMP_INTERNAL_FREE(__kmp_registration_str);
6605  KMP_INTERNAL_FREE(value);
6606  KMP_INTERNAL_FREE(name);
6607 
6608  __kmp_registration_flag = 0;
6609  __kmp_registration_str = NULL;
6610 
6611 } // __kmp_unregister_library
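// Note: if a process dies without reaching __kmp_unregister_library() (e.g.
// it is killed), its /dev/shm/__KMP_REGISTERED_LIB_<pid>[_<uid>] file remains.
// Because the name embeds the pid (and uid), the stale file is only revisited
// when a later process reuses that pid; the "neighbor is dead" branch in
// __kmp_register_library_startup() then unlinks it and registers anew.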
6612 
6613 // End of Library registration stuff.
6614 // -----------------------------------------------------------------------------
6615 
6616 #if KMP_MIC_SUPPORTED
6617 
6618 static void __kmp_check_mic_type() {
6619  kmp_cpuid_t cpuid_state = {0};
6620  kmp_cpuid_t *cs_p = &cpuid_state;
6621  __kmp_x86_cpuid(1, 0, cs_p);
6622  // We don't support mic1 at the moment
6623  if ((cs_p->eax & 0xff0) == 0xB10) {
6624  __kmp_mic_type = mic2;
6625  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6626  __kmp_mic_type = mic3;
6627  } else {
6628  __kmp_mic_type = non_mic;
6629  }
6630 }
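// Note on the CPUID signatures above: (eax & 0xff0) == 0xB10 matches family
// 0x0B, i.e. Knights Corner (hence mic2, referred to as "KNC" below), while
// (eax & 0xf0ff0) == 0x50670 matches family 6 / model 0x57, i.e. Knights
// Landing (mic3). This mapping follows the masks above and the KNC comments
// later in this file.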
6631 
6632 #endif /* KMP_MIC_SUPPORTED */
6633 
6634 #if KMP_HAVE_UMWAIT
6635 static void __kmp_user_level_mwait_init() {
6636  struct kmp_cpuid buf;
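  // CPUID leaf 7, subleaf 0: ECX bit 5 reports WAITPKG support
  // (umonitor/umwait/tpause), which is what the check below tests.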
6637  __kmp_x86_cpuid(7, 0, &buf);
6638  __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6639  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6640  __kmp_umwait_enabled));
6641 }
6642 #elif KMP_HAVE_MWAIT
6643 #ifndef AT_INTELPHIUSERMWAIT
6644 // Spurious, non-existent value that should always fail to return anything.
6645 // Will be replaced with the correct value once it is known.
6646 #define AT_INTELPHIUSERMWAIT 10000
6647 #endif
6648 // The getauxval() function is available in RHEL7 and SLES12. If a system with an
6649 // earlier OS is used to build the RTL, we'll use the following internal
6650 // function when the entry is not found.
6651 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6652 unsigned long getauxval(unsigned long) { return 0; }
6653 
6654 static void __kmp_user_level_mwait_init() {
6655  // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are available,
6656  // use them to determine whether user-level mwait is enabled. Otherwise, forcibly
6657  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6658  // KMP_USER_LEVEL_MWAIT was set to TRUE.
6659  if (__kmp_mic_type == mic3) {
6660  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6661  if ((res & 0x1) || __kmp_user_level_mwait) {
6662  __kmp_mwait_enabled = TRUE;
6663  if (__kmp_user_level_mwait) {
6664  KMP_INFORM(EnvMwaitWarn);
6665  }
6666  } else {
6667  __kmp_mwait_enabled = FALSE;
6668  }
6669  }
6670  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6671  "__kmp_mwait_enabled = %d\n",
6672  __kmp_mic_type, __kmp_mwait_enabled));
6673 }
6674 #endif /* KMP_HAVE_UMWAIT */
6675 
6676 static void __kmp_do_serial_initialize(void) {
6677  int i, gtid;
6678  size_t size;
6679 
6680  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6681 
6682  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6683  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6684  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6685  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6686  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6687 
6688 #if OMPT_SUPPORT
6689  ompt_pre_init();
6690 #endif
6691 
6692  __kmp_validate_locks();
6693 
6694  /* Initialize internal memory allocator */
6695  __kmp_init_allocator();
6696 
6697  /* Register the library startup via an environment variable and check to see
6698  whether another copy of the library is already registered. */
6699 
6700  __kmp_register_library_startup();
6701 
6702  /* TODO reinitialization of library */
6703  if (TCR_4(__kmp_global.g.g_done)) {
6704  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6705  }
6706 
6707  __kmp_global.g.g_abort = 0;
6708  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6709 
6710 /* initialize the locks */
6711 #if KMP_USE_ADAPTIVE_LOCKS
6712 #if KMP_DEBUG_ADAPTIVE_LOCKS
6713  __kmp_init_speculative_stats();
6714 #endif
6715 #endif
6716 #if KMP_STATS_ENABLED
6717  __kmp_stats_init();
6718 #endif
6719  __kmp_init_lock(&__kmp_global_lock);
6720  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6721  __kmp_init_lock(&__kmp_debug_lock);
6722  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6723  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6724  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6725  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6726  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6727  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6728  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6729  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6730  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6731  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6732  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6733  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6734  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6735  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6736  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6737 #if KMP_USE_MONITOR
6738  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6739 #endif
6740  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6741 
6742  /* conduct initialization and initial setup of configuration */
6743 
6744  __kmp_runtime_initialize();
6745 
6746 #if KMP_MIC_SUPPORTED
6747  __kmp_check_mic_type();
6748 #endif
6749 
6750 // Some global variable initialization moved here from kmp_env_initialize()
6751 #ifdef KMP_DEBUG
6752  kmp_diag = 0;
6753 #endif
6754  __kmp_abort_delay = 0;
6755 
6756  // From __kmp_init_dflt_team_nth()
6757  /* assume the entire machine will be used */
6758  __kmp_dflt_team_nth_ub = __kmp_xproc;
6759  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6760  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6761  }
6762  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6763  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6764  }
6765  __kmp_max_nth = __kmp_sys_max_nth;
6766  __kmp_cg_max_nth = __kmp_sys_max_nth;
6767  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6768  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6769  __kmp_teams_max_nth = __kmp_sys_max_nth;
6770  }
6771 
6772  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6773  // part
6774  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6775 #if KMP_USE_MONITOR
6776  __kmp_monitor_wakeups =
6777  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6778  __kmp_bt_intervals =
6779  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6780 #endif
6781  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6782  __kmp_library = library_throughput;
6783  // From KMP_SCHEDULE initialization
6784  __kmp_static = kmp_sch_static_balanced;
6785 // AC: do not use analytical here, because it is non-monotonic
6786 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6787 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6788 // need to repeat assignment
6789 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6790 // bit control and barrier method control parts
6791 #if KMP_FAST_REDUCTION_BARRIER
6792 #define kmp_reduction_barrier_gather_bb ((int)1)
6793 #define kmp_reduction_barrier_release_bb ((int)1)
6794 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6795 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6796 #endif // KMP_FAST_REDUCTION_BARRIER
6797  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6798  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6799  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6800  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6801  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6802 #if KMP_FAST_REDUCTION_BARRIER
6803  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6804  // lin_64 ): hyper,1
6805  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6806  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6807  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6808  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6809  }
6810 #endif // KMP_FAST_REDUCTION_BARRIER
6811  }
6812 #if KMP_FAST_REDUCTION_BARRIER
6813 #undef kmp_reduction_barrier_release_pat
6814 #undef kmp_reduction_barrier_gather_pat
6815 #undef kmp_reduction_barrier_release_bb
6816 #undef kmp_reduction_barrier_gather_bb
6817 #endif // KMP_FAST_REDUCTION_BARRIER
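// Interpretation (see kmp_barrier.cpp for the authoritative definition): a
// branch-bits value b is assumed here to give the barrier a branching factor
// of 2^b, so the reduction-barrier override above (gather/release bits of 1
// with the hyper pattern) requests a fan-in/fan-out of 2.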
6818 #if KMP_MIC_SUPPORTED
6819  if (__kmp_mic_type == mic2) { // KNC
6820  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6821  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6822  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6823  1; // forkjoin release
6824  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6825  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6826  }
6827 #if KMP_FAST_REDUCTION_BARRIER
6828  if (__kmp_mic_type == mic2) { // KNC
6829  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6830  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6831  }
6832 #endif // KMP_FAST_REDUCTION_BARRIER
6833 #endif // KMP_MIC_SUPPORTED
6834 
6835 // From KMP_CHECKS initialization
6836 #ifdef KMP_DEBUG
6837  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6838 #else
6839  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6840 #endif
6841 
6842  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6843  __kmp_foreign_tp = TRUE;
6844 
6845  __kmp_global.g.g_dynamic = FALSE;
6846  __kmp_global.g.g_dynamic_mode = dynamic_default;
6847 
6848  __kmp_env_initialize(NULL);
6849 
6850 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6851  __kmp_user_level_mwait_init();
6852 #endif
6853 // Print all messages in message catalog for testing purposes.
6854 #ifdef KMP_DEBUG
6855  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6856  if (__kmp_str_match_true(val)) {
6857  kmp_str_buf_t buffer;
6858  __kmp_str_buf_init(&buffer);
6859  __kmp_i18n_dump_catalog(&buffer);
6860  __kmp_printf("%s", buffer.str);
6861  __kmp_str_buf_free(&buffer);
6862  }
6863  __kmp_env_free(&val);
6864 #endif
6865 
6866  __kmp_threads_capacity =
6867  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6868  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6869  __kmp_tp_capacity = __kmp_default_tp_capacity(
6870  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6871 
6872  // If the library is shut down properly, both pools must be NULL. Just in
6873  // case, set them to NULL -- some memory may leak, but subsequent code will
6874  // work even if pools are not freed.
6875  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6876  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6877  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6878  __kmp_thread_pool = NULL;
6879  __kmp_thread_pool_insert_pt = NULL;
6880  __kmp_team_pool = NULL;
6881 
6882  /* Allocate all of the variable sized records */
6883  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6884  * expandable */
6885  /* Since allocation is cache-aligned, just add extra padding at the end */
6886  size =
6887  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6888  CACHE_LINE;
6889  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6890  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6891  sizeof(kmp_info_t *) * __kmp_threads_capacity);
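  // For example (assuming 8-byte pointers and CACHE_LINE == 64): with a
  // __kmp_threads_capacity of 32 the allocation above is
  // (8 + 8) * 32 + 64 = 576 bytes, and __kmp_root starts 256 bytes in,
  // immediately after the 32 kmp_info_t* slots.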
6892 
6893  /* init thread counts */
6894  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6895  0); // Asserts fail if the library is reinitializing and
6896  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6897  __kmp_all_nth = 0;
6898  __kmp_nth = 0;
6899 
6900  /* setup the uber master thread and hierarchy */
6901  gtid = __kmp_register_root(TRUE);
6902  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6903  KMP_ASSERT(KMP_UBER_GTID(gtid));
6904  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6905 
6906  KMP_MB(); /* Flush all pending memory write invalidates. */
6907 
6908  __kmp_common_initialize();
6909 
6910 #if KMP_OS_UNIX
6911  /* register the fork (atfork) handlers */
6912  __kmp_register_atfork();
6913 #endif
6914 
6915 #if !KMP_DYNAMIC_LIB
6916  {
6917  /* Invoke the exit handler when the program finishes, only for static
6918  library. For dynamic library, we already have _fini and DllMain. */
6919  int rc = atexit(__kmp_internal_end_atexit);
6920  if (rc != 0) {
6921  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6922  __kmp_msg_null);
6923  }
6924  }
6925 #endif
6926 
6927 #if KMP_HANDLE_SIGNALS
6928 #if KMP_OS_UNIX
6929  /* NOTE: make sure that this is called before the user installs their own
6930  signal handlers so that the user handlers are called first. this way they
6931  can return false, not call our handler, avoid terminating the library, and
6932  continue execution where they left off. */
6933  __kmp_install_signals(FALSE);
6934 #endif /* KMP_OS_UNIX */
6935 #if KMP_OS_WINDOWS
6936  __kmp_install_signals(TRUE);
6937 #endif /* KMP_OS_WINDOWS */
6938 #endif
6939 
6940  /* we have finished the serial initialization */
6941  __kmp_init_counter++;
6942 
6943  __kmp_init_serial = TRUE;
6944 
6945  if (__kmp_settings) {
6946  __kmp_env_print();
6947  }
6948 
6949  if (__kmp_display_env || __kmp_display_env_verbose) {
6950  __kmp_env_print_2();
6951  }
6952 
6953 #if OMPT_SUPPORT
6954  ompt_post_init();
6955 #endif
6956 
6957  KMP_MB();
6958 
6959  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6960 }
6961 
6962 void __kmp_serial_initialize(void) {
6963  if (__kmp_init_serial) {
6964  return;
6965  }
6966  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6967  if (__kmp_init_serial) {
6968  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6969  return;
6970  }
6971  __kmp_do_serial_initialize();
6972  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6973 }
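// __kmp_serial_initialize() above, and __kmp_middle_initialize(),
// __kmp_parallel_initialize() and __kmp_hidden_helper_initialize() below, all
// follow the same double-checked pattern: test the init flag without the
// lock, acquire __kmp_initz_lock, re-test the flag under the lock, and only
// then perform the actual initialization.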
6974 
6975 static void __kmp_do_middle_initialize(void) {
6976  int i, j;
6977  int prev_dflt_team_nth;
6978 
6979  if (!__kmp_init_serial) {
6980  __kmp_do_serial_initialize();
6981  }
6982 
6983  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6984 
6985  // Save the previous value for the __kmp_dflt_team_nth so that
6986  // we can avoid some reinitialization if it hasn't changed.
6987  prev_dflt_team_nth = __kmp_dflt_team_nth;
6988 
6989 #if KMP_AFFINITY_SUPPORTED
6990  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6991  // number of cores on the machine.
6992  __kmp_affinity_initialize();
6993 
6994  // Run through the __kmp_threads array and set the affinity mask
6995  // for each root thread that is currently registered with the RTL.
6996  for (i = 0; i < __kmp_threads_capacity; i++) {
6997  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6998  __kmp_affinity_set_init_mask(i, TRUE);
6999  }
7000  }
7001 #endif /* KMP_AFFINITY_SUPPORTED */
7002 
7003  KMP_ASSERT(__kmp_xproc > 0);
7004  if (__kmp_avail_proc == 0) {
7005  __kmp_avail_proc = __kmp_xproc;
7006  }
7007 
7008  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7009  // correct them now
7010  j = 0;
7011  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7012  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7013  __kmp_avail_proc;
7014  j++;
7015  }
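  // For example, OMP_NUM_THREADS=",,2,3" on a machine where __kmp_avail_proc
  // is 8 becomes the list 8,8,2,3: the empty leading entries are filled in
  // with __kmp_avail_proc, and __kmp_dflt_team_nth and its upper bound are set
  // to __kmp_avail_proc at the same time.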
7016 
7017  if (__kmp_dflt_team_nth == 0) {
7018 #ifdef KMP_DFLT_NTH_CORES
7019  // Default #threads = #cores
7020  __kmp_dflt_team_nth = __kmp_ncores;
7021  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7022  "__kmp_ncores (%d)\n",
7023  __kmp_dflt_team_nth));
7024 #else
7025  // Default #threads = #available OS procs
7026  __kmp_dflt_team_nth = __kmp_avail_proc;
7027  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7028  "__kmp_avail_proc(%d)\n",
7029  __kmp_dflt_team_nth));
7030 #endif /* KMP_DFLT_NTH_CORES */
7031  }
7032 
7033  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7034  __kmp_dflt_team_nth = KMP_MIN_NTH;
7035  }
7036  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7037  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7038  }
7039 
7040  // There's no harm in continuing if the following check fails,
7041  // but it indicates an error in the previous logic.
7042  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7043 
7044  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7045  // Run through the __kmp_threads array and set the num threads icv for each
7046  // root thread that is currently registered with the RTL (which has not
7047  // already explicitly set its nthreads-var with a call to
7048  // omp_set_num_threads()).
7049  for (i = 0; i < __kmp_threads_capacity; i++) {
7050  kmp_info_t *thread = __kmp_threads[i];
7051  if (thread == NULL)
7052  continue;
7053  if (thread->th.th_current_task->td_icvs.nproc != 0)
7054  continue;
7055 
7056  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7057  }
7058  }
7059  KA_TRACE(
7060  20,
7061  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7062  __kmp_dflt_team_nth));
7063 
7064 #ifdef KMP_ADJUST_BLOCKTIME
7065  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7066  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7067  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7068  if (__kmp_nth > __kmp_avail_proc) {
7069  __kmp_zero_bt = TRUE;
7070  }
7071  }
7072 #endif /* KMP_ADJUST_BLOCKTIME */
7073 
7074  /* we have finished middle initialization */
7075  TCW_SYNC_4(__kmp_init_middle, TRUE);
7076 
7077  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7078 }
7079 
7080 void __kmp_middle_initialize(void) {
7081  if (__kmp_init_middle) {
7082  return;
7083  }
7084  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7085  if (__kmp_init_middle) {
7086  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7087  return;
7088  }
7089  __kmp_do_middle_initialize();
7090  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7091 }
7092 
7093 void __kmp_parallel_initialize(void) {
7094  int gtid = __kmp_entry_gtid(); // this might be a new root
7095 
7096  /* synchronize parallel initialization (for sibling) */
7097  if (TCR_4(__kmp_init_parallel))
7098  return;
7099  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7100  if (TCR_4(__kmp_init_parallel)) {
7101  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7102  return;
7103  }
7104 
7105  /* TODO reinitialization after we have already shut down */
7106  if (TCR_4(__kmp_global.g.g_done)) {
7107  KA_TRACE(
7108  10,
7109  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7110  __kmp_infinite_loop();
7111  }
7112 
7113  /* jc: The lock __kmp_initz_lock is already held, so calling
7114  __kmp_serial_initialize would cause a deadlock. So we call
7115  __kmp_do_serial_initialize directly. */
7116  if (!__kmp_init_middle) {
7117  __kmp_do_middle_initialize();
7118  }
7119  __kmp_resume_if_hard_paused();
7120 
7121  /* begin initialization */
7122  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7123  KMP_ASSERT(KMP_UBER_GTID(gtid));
7124 
7125 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7126  // Save the FP control regs.
7127  // Worker threads will set theirs to these values at thread startup.
7128  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7129  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7130  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7131 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7132 
7133 #if KMP_OS_UNIX
7134 #if KMP_HANDLE_SIGNALS
7135  /* must be after __kmp_serial_initialize */
7136  __kmp_install_signals(TRUE);
7137 #endif
7138 #endif
7139 
7140  __kmp_suspend_initialize();
7141 
7142 #if defined(USE_LOAD_BALANCE)
7143  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7144  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7145  }
7146 #else
7147  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7148  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7149  }
7150 #endif
7151 
7152  if (__kmp_version) {
7153  __kmp_print_version_2();
7154  }
7155 
7156  /* we have finished parallel initialization */
7157  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7158 
7159  KMP_MB();
7160  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7161 
7162  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7163 }
7164 
7165 void __kmp_hidden_helper_initialize() {
7166  if (TCR_4(__kmp_init_hidden_helper))
7167  return;
7168 
7169  // __kmp_parallel_initialize is required before we initialize hidden helper
7170  if (!TCR_4(__kmp_init_parallel))
7171  __kmp_parallel_initialize();
7172 
7173  // Double check. Note that this double check should not be placed before
7174  // __kmp_parallel_initialize as it would cause a deadlock.
7175  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7176  if (TCR_4(__kmp_init_hidden_helper)) {
7177  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7178  return;
7179  }
7180 
7181  // Set the count of hidden helper tasks to be executed to zero
7182  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7183 
7184  // Set the global variable indicating that we're initializing hidden helper
7185  // team/threads
7186  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7187 
7188  // Platform independent initialization
7189  __kmp_do_initialize_hidden_helper_threads();
7190 
7191  // Wait here for the finish of initialization of hidden helper teams
7192  __kmp_hidden_helper_threads_initz_wait();
7193 
7194  // We have finished hidden helper initialization
7195  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7196 
7197  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7198 }
7199 
7200 /* ------------------------------------------------------------------------ */
7201 
7202 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7203  kmp_team_t *team) {
7204  kmp_disp_t *dispatch;
7205 
7206  KMP_MB();
7207 
7208  /* none of the threads have encountered any constructs, yet. */
7209  this_thr->th.th_local.this_construct = 0;
7210 #if KMP_CACHE_MANAGE
7211  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7212 #endif /* KMP_CACHE_MANAGE */
7213  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7214  KMP_DEBUG_ASSERT(dispatch);
7215  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7216  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7217  // this_thr->th.th_info.ds.ds_tid ] );
7218 
7219  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7220  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7221  if (__kmp_env_consistency_check)
7222  __kmp_push_parallel(gtid, team->t.t_ident);
7223 
7224  KMP_MB(); /* Flush all pending memory write invalidates. */
7225 }
7226 
7227 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7228  kmp_team_t *team) {
7229  if (__kmp_env_consistency_check)
7230  __kmp_pop_parallel(gtid, team->t.t_ident);
7231 
7232  __kmp_finish_implicit_task(this_thr);
7233 }
7234 
7235 int __kmp_invoke_task_func(int gtid) {
7236  int rc;
7237  int tid = __kmp_tid_from_gtid(gtid);
7238  kmp_info_t *this_thr = __kmp_threads[gtid];
7239  kmp_team_t *team = this_thr->th.th_team;
7240 
7241  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7242 #if USE_ITT_BUILD
7243  if (__itt_stack_caller_create_ptr) {
7244  // inform ittnotify about entering user's code
7245  if (team->t.t_stack_id != NULL) {
7246  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7247  } else {
7248  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7249  __kmp_itt_stack_callee_enter(
7250  (__itt_caller)team->t.t_parent->t.t_stack_id);
7251  }
7252  }
7253 #endif /* USE_ITT_BUILD */
7254 #if INCLUDE_SSC_MARKS
7255  SSC_MARK_INVOKING();
7256 #endif
7257 
7258 #if OMPT_SUPPORT
7259  void *dummy;
7260  void **exit_frame_p;
7261  ompt_data_t *my_task_data;
7262  ompt_data_t *my_parallel_data;
7263  int ompt_team_size;
7264 
7265  if (ompt_enabled.enabled) {
7266  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7267  .ompt_task_info.frame.exit_frame.ptr);
7268  } else {
7269  exit_frame_p = &dummy;
7270  }
7271 
7272  my_task_data =
7273  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7274  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7275  if (ompt_enabled.ompt_callback_implicit_task) {
7276  ompt_team_size = team->t.t_nproc;
7277  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7278  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7279  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7280  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7281  }
7282 #endif
7283 
7284 #if KMP_STATS_ENABLED
7285  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7286  if (previous_state == stats_state_e::TEAMS_REGION) {
7287  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7288  } else {
7289  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7290  }
7291  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7292 #endif
7293 
7294  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7295  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7296 #if OMPT_SUPPORT
7297  ,
7298  exit_frame_p
7299 #endif
7300  );
7301 #if OMPT_SUPPORT
7302  *exit_frame_p = NULL;
7303  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7304 #endif
7305 
7306 #if KMP_STATS_ENABLED
7307  if (previous_state == stats_state_e::TEAMS_REGION) {
7308  KMP_SET_THREAD_STATE(previous_state);
7309  }
7310  KMP_POP_PARTITIONED_TIMER();
7311 #endif
7312 
7313 #if USE_ITT_BUILD
7314  if (__itt_stack_caller_create_ptr) {
7315  // inform ittnotify about leaving user's code
7316  if (team->t.t_stack_id != NULL) {
7317  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7318  } else {
7319  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7320  __kmp_itt_stack_callee_leave(
7321  (__itt_caller)team->t.t_parent->t.t_stack_id);
7322  }
7323  }
7324 #endif /* USE_ITT_BUILD */
7325  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7326 
7327  return rc;
7328 }
7329 
7330 void __kmp_teams_master(int gtid) {
7331  // This routine is called by all primary threads in teams construct
7332  kmp_info_t *thr = __kmp_threads[gtid];
7333  kmp_team_t *team = thr->th.th_team;
7334  ident_t *loc = team->t.t_ident;
7335  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7336  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7337  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7338  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7339  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7340 
7341  // This thread is a new CG root. Set up the proper variables.
7342  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7343  tmp->cg_root = thr; // Make thr the CG root
7344  // Init to thread limit stored when league primary threads were forked
7345  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7346  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7347  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7348  " cg_nthreads to 1\n",
7349  thr, tmp));
7350  tmp->up = thr->th.th_cg_roots;
7351  thr->th.th_cg_roots = tmp;
7352 
7353 // Launch the league of teams now, but do not let the workers execute
7354 // (they hang on the fork barrier until the next parallel region)
7355 #if INCLUDE_SSC_MARKS
7356  SSC_MARK_FORKING();
7357 #endif
7358  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7359  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7360  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7361 #if INCLUDE_SSC_MARKS
7362  SSC_MARK_JOINING();
7363 #endif
7364  // If the team size was reduced from the limit, set it to the new size
7365  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7366  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7367  // AC: the last parameter "1" eliminates the join barrier, which won't work
7368  // because the worker threads are in a fork barrier waiting for more parallel
7369  // regions
7369  __kmp_join_call(loc, gtid
7370 #if OMPT_SUPPORT
7371  ,
7372  fork_context_intel
7373 #endif
7374  ,
7375  1);
7376 }
7377 
7378 int __kmp_invoke_teams_master(int gtid) {
7379  kmp_info_t *this_thr = __kmp_threads[gtid];
7380  kmp_team_t *team = this_thr->th.th_team;
7381 #if KMP_DEBUG
7382  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7383  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7384  (void *)__kmp_teams_master);
7385 #endif
7386  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7387 #if OMPT_SUPPORT
7388  int tid = __kmp_tid_from_gtid(gtid);
7389  ompt_data_t *task_data =
7390  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7391  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7392  if (ompt_enabled.ompt_callback_implicit_task) {
7393  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7394  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7395  ompt_task_initial);
7396  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7397  }
7398 #endif
7399  __kmp_teams_master(gtid);
7400 #if OMPT_SUPPORT
7401  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7402 #endif
7403  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7404  return 1;
7405 }
7406 
7407 /* this sets the requested number of threads for the next parallel region
7408  encountered by this team. since this should be enclosed in the forkjoin
7409  critical section it should avoid race conditions with asymmetrical nested
7410  parallelism */
7411 
7412 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7413  kmp_info_t *thr = __kmp_threads[gtid];
7414 
7415  if (num_threads > 0)
7416  thr->th.th_set_nproc = num_threads;
7417 }
7418 
7419 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7420  int num_threads) {
7421  KMP_DEBUG_ASSERT(thr);
7422  // Remember the number of threads for inner parallel regions
7423  if (!TCR_4(__kmp_init_middle))
7424  __kmp_middle_initialize(); // get internal globals calculated
7425  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7426  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7427 
7428  if (num_threads == 0) {
7429  if (__kmp_teams_thread_limit > 0) {
7430  num_threads = __kmp_teams_thread_limit;
7431  } else {
7432  num_threads = __kmp_avail_proc / num_teams;
7433  }
7434  // adjust num_threads without a warning as it is not a user setting
7435  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7436  // no thread_limit clause specified - do not change thread-limit-var ICV
7437  if (num_threads > __kmp_dflt_team_nth) {
7438  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7439  }
7440  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7441  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7442  } // prevent the team size from exceeding thread-limit-var
7443  if (num_teams * num_threads > __kmp_teams_max_nth) {
7444  num_threads = __kmp_teams_max_nth / num_teams;
7445  }
7446  if (num_threads == 0) {
7447  num_threads = 1;
7448  }
7449  } else {
7450  // This thread will be the primary thread of the league of primary threads
7451  // Store new thread limit; old limit is saved in th_cg_roots list
7452  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7453  // num_threads = min(num_threads, nthreads-var)
7454  if (num_threads > __kmp_dflt_team_nth) {
7455  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7456  }
7457  if (num_teams * num_threads > __kmp_teams_max_nth) {
7458  int new_threads = __kmp_teams_max_nth / num_teams;
7459  if (new_threads == 0) {
7460  new_threads = 1;
7461  }
7462  if (new_threads != num_threads) {
7463  if (!__kmp_reserve_warn) { // user asked for too many threads
7464  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7465  __kmp_msg(kmp_ms_warning,
7466  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7467  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7468  }
7469  }
7470  num_threads = new_threads;
7471  }
7472  }
7473  thr->th.th_teams_size.nth = num_threads;
7474 }
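// Worked example (hypothetical values): with no thread_limit clause
// (num_threads == 0), num_teams == 4, __kmp_teams_thread_limit unset,
// __kmp_avail_proc == 16, nthreads-var == 16 and an unlimited
// thread-limit-var, the code above picks 16 / 4 = 4 threads per team; the
// result is then capped so that num_teams * num_threads never exceeds
// __kmp_teams_max_nth.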
7475 
7476 /* this sets the requested number of teams for the teams region and/or
7477  the number of threads for the next parallel region encountered */
7478 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7479  int num_threads) {
7480  kmp_info_t *thr = __kmp_threads[gtid];
7481  KMP_DEBUG_ASSERT(num_teams >= 0);
7482  KMP_DEBUG_ASSERT(num_threads >= 0);
7483 
7484  if (num_teams == 0) {
7485  if (__kmp_nteams > 0) {
7486  num_teams = __kmp_nteams;
7487  } else {
7488  num_teams = 1; // default number of teams is 1.
7489  }
7490  }
7491  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7492  if (!__kmp_reserve_warn) {
7493  __kmp_reserve_warn = 1;
7494  __kmp_msg(kmp_ms_warning,
7495  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7496  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7497  }
7498  num_teams = __kmp_teams_max_nth;
7499  }
7500  // Set number of teams (number of threads in the outer "parallel" of the
7501  // teams)
7502  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7503 
7504  __kmp_push_thread_limit(thr, num_teams, num_threads);
7505 }
7506 
7507 /* This sets the requested number of teams for the teams region and/or
7508  the number of threads for the next parallel region encountered */
7509 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7510  int num_teams_ub, int num_threads) {
7511  kmp_info_t *thr = __kmp_threads[gtid];
7512  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7513  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7514  KMP_DEBUG_ASSERT(num_threads >= 0);
7515 
7516  if (num_teams_lb > num_teams_ub) {
7517  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7518  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7519  }
7520 
7521  int num_teams = 1; // default number of teams is 1.
7522 
7523  if (num_teams_lb == 0 && num_teams_ub > 0)
7524  num_teams_lb = num_teams_ub;
7525 
7526  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7527  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7528  if (num_teams > __kmp_teams_max_nth) {
7529  if (!__kmp_reserve_warn) {
7530  __kmp_reserve_warn = 1;
7531  __kmp_msg(kmp_ms_warning,
7532  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7533  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7534  }
7535  num_teams = __kmp_teams_max_nth;
7536  }
7537  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7538  num_teams = num_teams_ub;
7539  } else { // num_teams_lb <= num_teams <= num_teams_ub
7540  if (num_threads == 0) {
7541  if (num_teams_ub > __kmp_teams_max_nth) {
7542  num_teams = num_teams_lb;
7543  } else {
7544  num_teams = num_teams_ub;
7545  }
7546  } else {
7547  num_teams = (num_threads > __kmp_teams_max_nth)
7548  ? num_teams
7549  : __kmp_teams_max_nth / num_threads;
7550  if (num_teams < num_teams_lb) {
7551  num_teams = num_teams_lb;
7552  } else if (num_teams > num_teams_ub) {
7553  num_teams = num_teams_ub;
7554  }
7555  }
7556  }
7557  // Set number of teams (number of threads in the outer "parallel" of the
7558  // teams)
7559  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7560 
7561  __kmp_push_thread_limit(thr, num_teams, num_threads);
7562 }
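// Worked example (hypothetical values): for num_teams(4:8) with no
// num_threads clause, the upper bound is chosen as long as it does not exceed
// __kmp_teams_max_nth, giving num_teams = 8. With num_threads == 2 and
// __kmp_teams_max_nth == 12, the code above computes 12 / 2 = 6 teams, which
// already lies inside the requested [4, 8] range and is therefore kept.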
7563 
7564 // Set the proc_bind var to use in the following parallel region.
7565 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7566  kmp_info_t *thr = __kmp_threads[gtid];
7567  thr->th.th_set_proc_bind = proc_bind;
7568 }
7569 
7570 /* Launch the worker threads into the microtask. */
7571 
7572 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7573  kmp_info_t *this_thr = __kmp_threads[gtid];
7574 
7575 #ifdef KMP_DEBUG
7576  int f;
7577 #endif /* KMP_DEBUG */
7578 
7579  KMP_DEBUG_ASSERT(team);
7580  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7581  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7582  KMP_MB(); /* Flush all pending memory write invalidates. */
7583 
7584  team->t.t_construct = 0; /* no single directives seen yet */
7585  team->t.t_ordered.dt.t_value =
7586  0; /* thread 0 enters the ordered section first */
7587 
7588  /* Reset the identifiers on the dispatch buffer */
7589  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7590  if (team->t.t_max_nproc > 1) {
7591  int i;
7592  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7593  team->t.t_disp_buffer[i].buffer_index = i;
7594  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7595  }
7596  } else {
7597  team->t.t_disp_buffer[0].buffer_index = 0;
7598  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7599  }
7600 
7601  KMP_MB(); /* Flush all pending memory write invalidates. */
7602  KMP_ASSERT(this_thr->th.th_team == team);
7603 
7604 #ifdef KMP_DEBUG
7605  for (f = 0; f < team->t.t_nproc; f++) {
7606  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7607  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7608  }
7609 #endif /* KMP_DEBUG */
7610 
7611  /* release the worker threads so they may begin working */
7612  __kmp_fork_barrier(gtid, 0);
7613 }
7614 
7615 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7616  kmp_info_t *this_thr = __kmp_threads[gtid];
7617 
7618  KMP_DEBUG_ASSERT(team);
7619  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7620  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7621  KMP_MB(); /* Flush all pending memory write invalidates. */
7622 
7623  /* Join barrier after fork */
7624 
7625 #ifdef KMP_DEBUG
7626  if (__kmp_threads[gtid] &&
7627  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7628  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7629  __kmp_threads[gtid]);
7630  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7631  "team->t.t_nproc=%d\n",
7632  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7633  team->t.t_nproc);
7634  __kmp_print_structure();
7635  }
7636  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7637  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7638 #endif /* KMP_DEBUG */
7639 
7640  __kmp_join_barrier(gtid); /* wait for everyone */
7641 #if OMPT_SUPPORT
7642  if (ompt_enabled.enabled &&
7643  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7644  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7645  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7646  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7647 #if OMPT_OPTIONAL
7648  void *codeptr = NULL;
7649  if (KMP_MASTER_TID(ds_tid) &&
7650  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7651  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7652  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7653 
7654  if (ompt_enabled.ompt_callback_sync_region_wait) {
7655  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7656  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7657  codeptr);
7658  }
7659  if (ompt_enabled.ompt_callback_sync_region) {
7660  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7661  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7662  codeptr);
7663  }
7664 #endif
7665  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7666  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7667  ompt_scope_end, NULL, task_data, 0, ds_tid,
7668  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7669  }
7670  }
7671 #endif
7672 
7673  KMP_MB(); /* Flush all pending memory write invalidates. */
7674  KMP_ASSERT(this_thr->th.th_team == team);
7675 }
7676 
7677 /* ------------------------------------------------------------------------ */
7678 
7679 #ifdef USE_LOAD_BALANCE
7680 
7681 // Return the number of worker threads actively spinning in the hot team if
7682 // we are at the outermost level of parallelism. Otherwise, return 0.
7683 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7684  int i;
7685  int retval;
7686  kmp_team_t *hot_team;
7687 
7688  if (root->r.r_active) {
7689  return 0;
7690  }
7691  hot_team = root->r.r_hot_team;
7692  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7693  return hot_team->t.t_nproc - 1; // Don't count primary thread
7694  }
7695 
7696  // Skip the primary thread - it is accounted for elsewhere.
7697  retval = 0;
7698  for (i = 1; i < hot_team->t.t_nproc; i++) {
7699  if (hot_team->t.t_threads[i]->th.th_active) {
7700  retval++;
7701  }
7702  }
7703  return retval;
7704 }
7705 
7706 // Perform an automatic adjustment to the number of
7707 // threads used by the next parallel region.
7708 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7709  int retval;
7710  int pool_active;
7711  int hot_team_active;
7712  int team_curr_active;
7713  int system_active;
7714 
7715  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7716  set_nproc));
7717  KMP_DEBUG_ASSERT(root);
7718  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7719  ->th.th_current_task->td_icvs.dynamic == TRUE);
7720  KMP_DEBUG_ASSERT(set_nproc > 1);
7721 
7722  if (set_nproc == 1) {
7723  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7724  return 1;
7725  }
7726 
7727  // Threads that are active in the thread pool, active in the hot team for this
7728  // particular root (if we are at the outer par level), and the currently
7729  // executing thread (to become the primary thread) are available to add to the
7730  // new team, but are currently contributing to the system load, and must be
7731  // accounted for.
7732  pool_active = __kmp_thread_pool_active_nth;
7733  hot_team_active = __kmp_active_hot_team_nproc(root);
7734  team_curr_active = pool_active + hot_team_active + 1;
7735 
7736  // Check the system load.
7737  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7738  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7739  "hot team active = %d\n",
7740  system_active, pool_active, hot_team_active));
7741 
7742  if (system_active < 0) {
7743  // There was an error reading the necessary info from /proc, so use the
7744  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7745  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7746  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7747  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7748 
7749  // Make this call behave like the thread limit algorithm.
7750  retval = __kmp_avail_proc - __kmp_nth +
7751  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7752  if (retval > set_nproc) {
7753  retval = set_nproc;
7754  }
7755  if (retval < KMP_MIN_NTH) {
7756  retval = KMP_MIN_NTH;
7757  }
7758 
7759  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7760  retval));
7761  return retval;
7762  }
7763 
7764  // There is a slight delay in the load balance algorithm in detecting new
7765  // running procs. The real system load at this instant should be at least as
7766  // large as the number of active OpenMP threads available to add to the team.
7767  if (system_active < team_curr_active) {
7768  system_active = team_curr_active;
7769  }
7770  retval = __kmp_avail_proc - system_active + team_curr_active;
7771  if (retval > set_nproc) {
7772  retval = set_nproc;
7773  }
7774  if (retval < KMP_MIN_NTH) {
7775  retval = KMP_MIN_NTH;
7776  }
7777 
7778  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7779  return retval;
7780 } // __kmp_load_balance_nproc()
7781 
7782 #endif /* USE_LOAD_BALANCE */
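
/*
  [Editorial sketch -- not part of the runtime source] The core of
  __kmp_load_balance_nproc is the clamp at the end of the function. A
  standalone restatement with hypothetical stand-ins for the globals
  (__kmp_avail_proc, KMP_MIN_NTH):

    static int load_balance_sketch(int avail_proc, int system_active,
                                   int team_curr_active, int set_nproc,
                                   int min_nth) {
      // The load reading may lag, so it can never be treated as smaller
      // than the threads we already know will join the new team.
      if (system_active < team_curr_active)
        system_active = team_curr_active;
      int n = avail_proc - system_active + team_curr_active;
      if (n > set_nproc) // never exceed the requested team size
        n = set_nproc;
      if (n < min_nth)   // never drop below the runtime minimum
        n = min_nth;
      return n;
    }
*/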
7783 
7784 /* ------------------------------------------------------------------------ */
7785 
7786 /* NOTE: this is called with the __kmp_init_lock held */
7787 void __kmp_cleanup(void) {
7788  int f;
7789 
7790  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7791 
7792  if (TCR_4(__kmp_init_parallel)) {
7793 #if KMP_HANDLE_SIGNALS
7794  __kmp_remove_signals();
7795 #endif
7796  TCW_4(__kmp_init_parallel, FALSE);
7797  }
7798 
7799  if (TCR_4(__kmp_init_middle)) {
7800 #if KMP_AFFINITY_SUPPORTED
7801  __kmp_affinity_uninitialize();
7802 #endif /* KMP_AFFINITY_SUPPORTED */
7803  __kmp_cleanup_hierarchy();
7804  TCW_4(__kmp_init_middle, FALSE);
7805  }
7806 
7807  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7808 
7809  if (__kmp_init_serial) {
7810  __kmp_runtime_destroy();
7811  __kmp_init_serial = FALSE;
7812  }
7813 
7814  __kmp_cleanup_threadprivate_caches();
7815 
7816  for (f = 0; f < __kmp_threads_capacity; f++) {
7817  if (__kmp_root[f] != NULL) {
7818  __kmp_free(__kmp_root[f]);
7819  __kmp_root[f] = NULL;
7820  }
7821  }
7822  __kmp_free(__kmp_threads);
7823  // __kmp_threads and __kmp_root were allocated as a single block, so there is
7824  // no need to free __kmp_root separately.
7825  __kmp_threads = NULL;
7826  __kmp_root = NULL;
7827  __kmp_threads_capacity = 0;
7828 
7829 #if KMP_USE_DYNAMIC_LOCK
7830  __kmp_cleanup_indirect_user_locks();
7831 #else
7832  __kmp_cleanup_user_locks();
7833 #endif
7834 
7835 #if KMP_AFFINITY_SUPPORTED
7836  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7837  __kmp_cpuinfo_file = NULL;
7838 #endif /* KMP_AFFINITY_SUPPORTED */
7839 
7840 #if KMP_USE_ADAPTIVE_LOCKS
7841 #if KMP_DEBUG_ADAPTIVE_LOCKS
7842  __kmp_print_speculative_stats();
7843 #endif
7844 #endif
7845  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7846  __kmp_nested_nth.nth = NULL;
7847  __kmp_nested_nth.size = 0;
7848  __kmp_nested_nth.used = 0;
7849  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7850  __kmp_nested_proc_bind.bind_types = NULL;
7851  __kmp_nested_proc_bind.size = 0;
7852  __kmp_nested_proc_bind.used = 0;
7853  if (__kmp_affinity_format) {
7854  KMP_INTERNAL_FREE(__kmp_affinity_format);
7855  __kmp_affinity_format = NULL;
7856  }
7857 
7858  __kmp_i18n_catclose();
7859 
7860 #if KMP_USE_HIER_SCHED
7861  __kmp_hier_scheds.deallocate();
7862 #endif
7863 
7864 #if KMP_STATS_ENABLED
7865  __kmp_stats_fini();
7866 #endif
7867 
7868  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7869 }
7870 
7871 /* ------------------------------------------------------------------------ */
7872 
7873 int __kmp_ignore_mppbeg(void) {
7874  char *env;
7875 
7876  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7877  if (__kmp_str_match_false(env))
7878  return FALSE;
7879  }
7880  // By default __kmpc_begin() is no-op.
7881  return TRUE;
7882 }
7883 
7884 int __kmp_ignore_mppend(void) {
7885  char *env;
7886 
7887  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7888  if (__kmp_str_match_false(env))
7889  return FALSE;
7890  }
7891  // By default __kmpc_end() is no-op.
7892  return TRUE;
7893 }
7894 
7895 void __kmp_internal_begin(void) {
7896  int gtid;
7897  kmp_root_t *root;
7898 
7899  /* this is a very important step as it will register new sibling threads
7900  and assign these new uber threads new gtids */
7901  gtid = __kmp_entry_gtid();
7902  root = __kmp_threads[gtid]->th.th_root;
7903  KMP_ASSERT(KMP_UBER_GTID(gtid));
7904 
7905  if (root->r.r_begin)
7906  return;
7907  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7908  if (root->r.r_begin) {
7909  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7910  return;
7911  }
7912 
7913  root->r.r_begin = TRUE;
7914 
7915  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7916 }
7917 
7918 /* ------------------------------------------------------------------------ */
7919 
7920 void __kmp_user_set_library(enum library_type arg) {
7921  int gtid;
7922  kmp_root_t *root;
7923  kmp_info_t *thread;
7924 
7925  /* first, make sure we are initialized so we can get our gtid */
7926 
7927  gtid = __kmp_entry_gtid();
7928  thread = __kmp_threads[gtid];
7929 
7930  root = thread->th.th_root;
7931 
7932  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7933  library_serial));
7934  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7935  thread */
7936  KMP_WARNING(SetLibraryIncorrectCall);
7937  return;
7938  }
7939 
7940  switch (arg) {
7941  case library_serial:
7942  thread->th.th_set_nproc = 0;
7943  set__nproc(thread, 1);
7944  break;
7945  case library_turnaround:
7946  thread->th.th_set_nproc = 0;
7947  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7948  : __kmp_dflt_team_nth_ub);
7949  break;
7950  case library_throughput:
7951  thread->th.th_set_nproc = 0;
7952  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7953  : __kmp_dflt_team_nth_ub);
7954  break;
7955  default:
7956  KMP_FATAL(UnknownLibraryType, arg);
7957  }
7958 
7959  __kmp_aux_set_library(arg);
7960 }
7961 
7962 void __kmp_aux_set_stacksize(size_t arg) {
7963  if (!__kmp_init_serial)
7964  __kmp_serial_initialize();
7965 
7966 #if KMP_OS_DARWIN
7967  if (arg & (0x1000 - 1)) {
7968  arg &= ~(0x1000 - 1);
7969  if (arg + 0x1000) /* check for overflow if we round up */
7970  arg += 0x1000;
7971  }
7972 #endif
7973  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7974 
7975  /* only change the default stacksize before the first parallel region */
7976  if (!TCR_4(__kmp_init_parallel)) {
7977  size_t value = arg; /* argument is in bytes */
7978 
7979  if (value < __kmp_sys_min_stksize)
7980  value = __kmp_sys_min_stksize;
7981  else if (value > KMP_MAX_STKSIZE)
7982  value = KMP_MAX_STKSIZE;
7983 
7984  __kmp_stksize = value;
7985 
7986  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7987  }
7988 
7989  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7990 }
7991 
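/*
  [Editorial example -- not part of the runtime source] A user-side sketch of
  setting the worker stack size, assuming the kmp_set_stacksize_s() extension
  declared in libomp's omp.h reaches __kmp_aux_set_stacksize; the value only
  takes effect before the first parallel region, matching the
  !TCR_4(__kmp_init_parallel) guard above:

    #include <omp.h>

    int main(void) {
      kmp_set_stacksize_s((size_t)8 * 1024 * 1024); // 8 MiB worker stacks
      #pragma omp parallel
      { } // body omitted
      return 0;
    }
*/
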
7992 /* set the behaviour of the runtime library */
7993 /* TODO this can cause some odd behaviour with sibling parallelism... */
7994 void __kmp_aux_set_library(enum library_type arg) {
7995  __kmp_library = arg;
7996 
7997  switch (__kmp_library) {
7998  case library_serial: {
7999  KMP_INFORM(LibraryIsSerial);
8000  } break;
8001  case library_turnaround:
8002  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8003  __kmp_use_yield = 2; // only yield when oversubscribed
8004  break;
8005  case library_throughput:
8006  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8007  __kmp_dflt_blocktime = 200;
8008  break;
8009  default:
8010  KMP_FATAL(UnknownLibraryType, arg);
8011  }
8012 }
8013 
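/*
  [Editorial example -- not part of the runtime source] The library modes
  correspond to the KMP_LIBRARY setting (serial | turnaround | throughput)
  and to the kmp_set_library_*() extensions in libomp's omp.h. A sketch:

    #include <omp.h>

    int main(void) {
      // Equivalent to KMP_LIBRARY=throughput; per the switch in
      // __kmp_aux_set_library, an "infinite" default blocktime is
      // reduced to 200 ms in this mode.
      kmp_set_library_throughput();
      #pragma omp parallel
      { } // body omitted
      return 0;
    }
*/
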
8014 /* Get team information common to all teams API routines */
8015 // Returns NULL if not in a teams construct
8016 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8017  kmp_info_t *thr = __kmp_entry_thread();
8018  teams_serialized = 0;
8019  if (thr->th.th_teams_microtask) {
8020  kmp_team_t *team = thr->th.th_team;
8021  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8022  int ii = team->t.t_level;
8023  teams_serialized = team->t.t_serialized;
8024  int level = tlevel + 1;
8025  KMP_DEBUG_ASSERT(ii >= tlevel);
8026  while (ii > level) {
8027  for (teams_serialized = team->t.t_serialized;
8028  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8029  }
8030  if (team->t.t_serialized && (!teams_serialized)) {
8031  team = team->t.t_parent;
8032  continue;
8033  }
8034  if (ii > level) {
8035  team = team->t.t_parent;
8036  ii--;
8037  }
8038  }
8039  return team;
8040  }
8041  return NULL;
8042 }
8043 
8044 int __kmp_aux_get_team_num() {
8045  int serialized;
8046  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8047  if (team) {
8048  if (serialized > 1) {
8049  return 0; // teams region is serialized ( 1 team of 1 thread ).
8050  } else {
8051  return team->t.t_master_tid;
8052  }
8053  }
8054  return 0;
8055 }
8056 
8057 int __kmp_aux_get_num_teams() {
8058  int serialized;
8059  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8060  if (team) {
8061  if (serialized > 1) {
8062  return 1;
8063  } else {
8064  return team->t.t_parent->t.t_nproc;
8065  }
8066  }
8067  return 1;
8068 }
8069 
8070 /* ------------------------------------------------------------------------ */
8071 
8072 /*
8073  * Affinity Format Parser
8074  *
8075  * Field is in form of: %[[[0].]size]type
8076  * % and type are required (%% means print a literal '%')
8077  * type is either single char or long name surrounded by {},
8078  * e.g., N or {num_threads}
8079  * 0 => leading zeros
8080  * . => right justified when size is specified
8081  * by default output is left justified
8082  * size is the *minimum* field length
8083  * All other characters are printed as is
8084  *
8085  * Available field types (must match __kmp_affinity_format_table below):
8086  * t {team_num} - omp_get_team_num()
8087  * T {num_teams} - omp_get_num_teams()
8088  * L {nesting_level} - omp_get_level()
8089  * n {thread_num} - omp_get_thread_num()
8090  * N {num_threads} - omp_get_num_threads()
8091  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8092  * H {host} - name of host machine, P {process_id} - process id (integer)
8093  * i {native_thread_id} - native thread identifier (integer)
8094  * A {thread_affinity} - comma separated integers/ranges (affinity mask)
8095  *
8096  * Implementation-specific field types can be added
8097  * If a type is unknown, print "undefined"
8098  */
8099 
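/*
  [Editorial example -- not part of the runtime source] A user-side sketch of
  the format fields parsed below, using the OpenMP 5.0 affinity-format API
  (the %A / %{thread_affinity} field needs an affinity-enabled build):

    #include <omp.h>

    int main(void) {
      // Short and long names can be mixed; %0.4n zero-pads the thread number
      // to a minimum width of 4, and literal text is copied through.
      omp_set_affinity_format("host=%H pid=%P tid=%0.4n aff=%{thread_affinity}");
      #pragma omp parallel num_threads(2)
      omp_display_affinity(NULL); // NULL => use the affinity-format-var ICV
      return 0;
    }
*/
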
8100 // Structure holding the short name, long name, and corresponding data type
8101 // for snprintf. A table of these represents the full set of valid keyword
8102 // field types.
8103 typedef struct kmp_affinity_format_field_t {
8104  char short_name; // from spec e.g., L -> thread level
8105  const char *long_name; // from spec thread_level -> thread level
8106  char field_format; // data type for snprintf (typically 'd' or 's'
8107  // for integer or string)
8108 } kmp_affinity_format_field_t;
8109 
8110 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8111 #if KMP_AFFINITY_SUPPORTED
8112  {'A', "thread_affinity", 's'},
8113 #endif
8114  {'t', "team_num", 'd'},
8115  {'T', "num_teams", 'd'},
8116  {'L', "nesting_level", 'd'},
8117  {'n', "thread_num", 'd'},
8118  {'N', "num_threads", 'd'},
8119  {'a', "ancestor_tnum", 'd'},
8120  {'H', "host", 's'},
8121  {'P', "process_id", 'd'},
8122  {'i', "native_thread_id", 'd'}};
8123 
8124 // Return the number of characters needed to hold the field
8125 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8126  const char **ptr,
8127  kmp_str_buf_t *field_buffer) {
8128  int rc, format_index, field_value;
8129  const char *width_left, *width_right;
8130  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8131  static const int FORMAT_SIZE = 20;
8132  char format[FORMAT_SIZE] = {0};
8133  char absolute_short_name = 0;
8134 
8135  KMP_DEBUG_ASSERT(gtid >= 0);
8136  KMP_DEBUG_ASSERT(th);
8137  KMP_DEBUG_ASSERT(**ptr == '%');
8138  KMP_DEBUG_ASSERT(field_buffer);
8139 
8140  __kmp_str_buf_clear(field_buffer);
8141 
8142  // Skip the initial %
8143  (*ptr)++;
8144 
8145  // Check for %% first
8146  if (**ptr == '%') {
8147  __kmp_str_buf_cat(field_buffer, "%", 1);
8148  (*ptr)++; // skip over the second %
8149  return 1;
8150  }
8151 
8152  // Parse field modifiers if they are present
8153  pad_zeros = false;
8154  if (**ptr == '0') {
8155  pad_zeros = true;
8156  (*ptr)++; // skip over 0
8157  }
8158  right_justify = false;
8159  if (**ptr == '.') {
8160  right_justify = true;
8161  (*ptr)++; // skip over .
8162  }
8163  // Parse width of field: [width_left, width_right)
8164  width_left = width_right = NULL;
8165  if (**ptr >= '0' && **ptr <= '9') {
8166  width_left = *ptr;
8167  SKIP_DIGITS(*ptr);
8168  width_right = *ptr;
8169  }
8170 
8171  // Create the format for KMP_SNPRINTF based on flags parsed above
8172  format_index = 0;
8173  format[format_index++] = '%';
8174  if (!right_justify)
8175  format[format_index++] = '-';
8176  if (pad_zeros)
8177  format[format_index++] = '0';
8178  if (width_left && width_right) {
8179  int i = 0;
8180  // Only allow widths of up to 8 digits.
8181  // This also prevents overflowing the format buffer
8182  while (i < 8 && width_left < width_right) {
8183  format[format_index++] = *width_left;
8184  width_left++;
8185  i++;
8186  }
8187  }
8188 
8189  // Parse a name (long or short)
8190  // Canonicalize the name into absolute_short_name
8191  found_valid_name = false;
8192  parse_long_name = (**ptr == '{');
8193  if (parse_long_name)
8194  (*ptr)++; // skip initial left brace
8195  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8196  sizeof(__kmp_affinity_format_table[0]);
8197  ++i) {
8198  char short_name = __kmp_affinity_format_table[i].short_name;
8199  const char *long_name = __kmp_affinity_format_table[i].long_name;
8200  char field_format = __kmp_affinity_format_table[i].field_format;
8201  if (parse_long_name) {
8202  size_t length = KMP_STRLEN(long_name);
8203  if (strncmp(*ptr, long_name, length) == 0) {
8204  found_valid_name = true;
8205  (*ptr) += length; // skip the long name
8206  }
8207  } else if (**ptr == short_name) {
8208  found_valid_name = true;
8209  (*ptr)++; // skip the short name
8210  }
8211  if (found_valid_name) {
8212  format[format_index++] = field_format;
8213  format[format_index++] = '\0';
8214  absolute_short_name = short_name;
8215  break;
8216  }
8217  }
8218  if (parse_long_name) {
8219  if (**ptr != '}') {
8220  absolute_short_name = 0;
8221  } else {
8222  (*ptr)++; // skip over the right brace
8223  }
8224  }
8225 
8226  // Attempt to fill the buffer with the requested
8227  // value using snprintf within __kmp_str_buf_print()
8228  switch (absolute_short_name) {
8229  case 't':
8230  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8231  break;
8232  case 'T':
8233  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8234  break;
8235  case 'L':
8236  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8237  break;
8238  case 'n':
8239  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8240  break;
8241  case 'H': {
8242  static const int BUFFER_SIZE = 256;
8243  char buf[BUFFER_SIZE];
8244  __kmp_expand_host_name(buf, BUFFER_SIZE);
8245  rc = __kmp_str_buf_print(field_buffer, format, buf);
8246  } break;
8247  case 'P':
8248  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8249  break;
8250  case 'i':
8251  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8252  break;
8253  case 'N':
8254  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8255  break;
8256  case 'a':
8257  field_value =
8258  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8259  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8260  break;
8261 #if KMP_AFFINITY_SUPPORTED
8262  case 'A': {
8263  kmp_str_buf_t buf;
8264  __kmp_str_buf_init(&buf);
8265  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8266  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8267  __kmp_str_buf_free(&buf);
8268  } break;
8269 #endif
8270  default:
8271  // According to the spec, if an implementation does not have info for a field
8272  // type, then "undefined" is printed
8273  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8274  // Skip the field
8275  if (parse_long_name) {
8276  SKIP_TOKEN(*ptr);
8277  if (**ptr == '}')
8278  (*ptr)++;
8279  } else {
8280  (*ptr)++;
8281  }
8282  }
8283 
8284  KMP_ASSERT(format_index <= FORMAT_SIZE);
8285  return rc;
8286 }
8287 
8288 /*
8289  * Return number of characters needed to hold the affinity string
8290  * (not including null byte character)
8291  * The resultant string is printed to buffer, which the caller can then
8292  * handle afterwards
8293  */
8294 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8295  kmp_str_buf_t *buffer) {
8296  const char *parse_ptr;
8297  size_t retval;
8298  const kmp_info_t *th;
8299  kmp_str_buf_t field;
8300 
8301  KMP_DEBUG_ASSERT(buffer);
8302  KMP_DEBUG_ASSERT(gtid >= 0);
8303 
8304  __kmp_str_buf_init(&field);
8305  __kmp_str_buf_clear(buffer);
8306 
8307  th = __kmp_threads[gtid];
8308  retval = 0;
8309 
8310  // If format is NULL or zero-length string, then we use
8311  // affinity-format-var ICV
8312  parse_ptr = format;
8313  if (parse_ptr == NULL || *parse_ptr == '\0') {
8314  parse_ptr = __kmp_affinity_format;
8315  }
8316  KMP_DEBUG_ASSERT(parse_ptr);
8317 
8318  while (*parse_ptr != '\0') {
8319  // Parse a field
8320  if (*parse_ptr == '%') {
8321  // Put field in the buffer
8322  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8323  __kmp_str_buf_catbuf(buffer, &field);
8324  retval += rc;
8325  } else {
8326  // Put literal character in buffer
8327  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8328  retval++;
8329  parse_ptr++;
8330  }
8331  }
8332  __kmp_str_buf_free(&field);
8333  return retval;
8334 }
8335 
8336 // Displays the affinity string to stdout
8337 void __kmp_aux_display_affinity(int gtid, const char *format) {
8338  kmp_str_buf_t buf;
8339  __kmp_str_buf_init(&buf);
8340  __kmp_aux_capture_affinity(gtid, format, &buf);
8341  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8342  __kmp_str_buf_free(&buf);
8343 }
8344 
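/*
  [Editorial example -- not part of the runtime source] omp_capture_affinity()
  (OpenMP 5.0) surfaces __kmp_aux_capture_affinity to user code; its return
  value is the character count computed above, so a too-small buffer can be
  detected and resized (%A again requires an affinity-enabled build):

    #include <omp.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
      #pragma omp parallel
      {
        char probe[1];
        // First call reports how many characters the expansion needs
        // (terminating NUL not included).
        size_t need = omp_capture_affinity(probe, sizeof(probe), "tid=%n aff=%A");
        char *buf = (char *)malloc(need + 1);
        omp_capture_affinity(buf, need + 1, "tid=%n aff=%A");
        printf("%s\n", buf);
        free(buf);
      }
      return 0;
    }
*/
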
8345 /* ------------------------------------------------------------------------ */
8346 
8347 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8348  int blocktime = arg; /* argument is in milliseconds */
8349 #if KMP_USE_MONITOR
8350  int bt_intervals;
8351 #endif
8352  kmp_int8 bt_set;
8353 
8354  __kmp_save_internal_controls(thread);
8355 
8356  /* Normalize and set blocktime for the teams */
8357  if (blocktime < KMP_MIN_BLOCKTIME)
8358  blocktime = KMP_MIN_BLOCKTIME;
8359  else if (blocktime > KMP_MAX_BLOCKTIME)
8360  blocktime = KMP_MAX_BLOCKTIME;
8361 
8362  set__blocktime_team(thread->th.th_team, tid, blocktime);
8363  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8364 
8365 #if KMP_USE_MONITOR
8366  /* Calculate and set blocktime intervals for the teams */
8367  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8368 
8369  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8370  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8371 #endif
8372 
8373  /* Record that blocktime was explicitly set (bt_set = TRUE) */
8374  bt_set = TRUE;
8375 
8376  set__bt_set_team(thread->th.th_team, tid, bt_set);
8377  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8378 #if KMP_USE_MONITOR
8379  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8380  "bt_intervals=%d, monitor_updates=%d\n",
8381  __kmp_gtid_from_tid(tid, thread->th.th_team),
8382  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8383  __kmp_monitor_wakeups));
8384 #else
8385  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8386  __kmp_gtid_from_tid(tid, thread->th.th_team),
8387  thread->th.th_team->t.t_id, tid, blocktime));
8388 #endif
8389 }
8390 
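/*
  [Editorial example -- not part of the runtime source] Blocktime is normally
  controlled with KMP_BLOCKTIME or the kmp_set_blocktime() extension, which
  feeds the clamping above (KMP_MIN_BLOCKTIME .. KMP_MAX_BLOCKTIME). A sketch:

    #include <omp.h>

    int main(void) {
      // 0 ms: workers go to sleep at barriers immediately instead of
      // spin-waiting; useful when the machine is shared.
      kmp_set_blocktime(0);
      #pragma omp parallel
      { } // body omitted
      return 0;
    }
*/
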
8391 void __kmp_aux_set_defaults(char const *str, size_t len) {
8392  if (!__kmp_init_serial) {
8393  __kmp_serial_initialize();
8394  }
8395  __kmp_env_initialize(str);
8396 
8397  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8398  __kmp_env_print();
8399  }
8400 } // __kmp_aux_set_defaults
8401 
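/*
  [Editorial example -- not part of the runtime source] kmp_set_defaults()
  (a libomp extension) is the usual route into __kmp_aux_set_defaults: it
  applies a settings string as if it had been set in the environment, and is
  best called before the first parallel region. A sketch:

    #include <omp.h>

    int main(void) {
      kmp_set_defaults("KMP_BLOCKTIME=200"); // same syntax as the env variable
      #pragma omp parallel
      { } // body omitted
      return 0;
    }
*/
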
8402 /* ------------------------------------------------------------------------ */
8403 /* internal fast reduction routines */
8404 
8405 PACKED_REDUCTION_METHOD_T
8406 __kmp_determine_reduction_method(
8407  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8408  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8409  kmp_critical_name *lck) {
8410 
8411  // Default reduction method: critical construct ( lck != NULL, like in current
8412  // PAROPT )
8413  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8414  // can be selected by RTL
8415  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8416  // can be selected by RTL
8417  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8418  // among generated by PAROPT.
8419 
8420  PACKED_REDUCTION_METHOD_T retval;
8421 
8422  int team_size;
8423 
8424  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8425  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8426 
8427 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8428  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8429 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8430 
8431  retval = critical_reduce_block;
8432  // an alternative way of getting the team size (one dynamic dereference) is slower
8433  // another choice of getting a team size (with 1 dynamic deference) is slower
8434  team_size = __kmp_get_team_num_threads(global_tid);
8435  if (team_size == 1) {
8436 
8437  retval = empty_reduce_block;
8438 
8439  } else {
8440 
8441  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8442 
8443 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8444  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8445 
8446 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8447  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8448 
8449  int teamsize_cutoff = 4;
8450 
8451 #if KMP_MIC_SUPPORTED
8452  if (__kmp_mic_type != non_mic) {
8453  teamsize_cutoff = 8;
8454  }
8455 #endif
8456  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8457  if (tree_available) {
8458  if (team_size <= teamsize_cutoff) {
8459  if (atomic_available) {
8460  retval = atomic_reduce_block;
8461  }
8462  } else {
8463  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8464  }
8465  } else if (atomic_available) {
8466  retval = atomic_reduce_block;
8467  }
8468 #else
8469 #error "Unknown or unsupported OS"
8470 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8471  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8472 
8473 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8474 
8475 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8476 
8477  // basic tuning
8478 
8479  if (atomic_available) {
8480  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8481  retval = atomic_reduce_block;
8482  }
8483  } // otherwise: use critical section
8484 
8485 #elif KMP_OS_DARWIN
8486 
8487  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8488  if (atomic_available && (num_vars <= 3)) {
8489  retval = atomic_reduce_block;
8490  } else if (tree_available) {
8491  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8492  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8493  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8494  }
8495  } // otherwise: use critical section
8496 
8497 #else
8498 #error "Unknown or unsupported OS"
8499 #endif
8500 
8501 #else
8502 #error "Unknown or unsupported architecture"
8503 #endif
8504  }
8505 
8506  // KMP_FORCE_REDUCTION
8507 
8508  // If the team is serialized (team_size == 1), ignore the forced reduction
8509  // method and stay with the unsynchronized method (empty_reduce_block)
8510  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8511  team_size != 1) {
8512 
8513  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8514 
8515  int atomic_available, tree_available;
8516 
8517  switch ((forced_retval = __kmp_force_reduction_method)) {
8518  case critical_reduce_block:
8519  KMP_ASSERT(lck); // lck should be != 0
8520  break;
8521 
8522  case atomic_reduce_block:
8523  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8524  if (!atomic_available) {
8525  KMP_WARNING(RedMethodNotSupported, "atomic");
8526  forced_retval = critical_reduce_block;
8527  }
8528  break;
8529 
8530  case tree_reduce_block:
8531  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8532  if (!tree_available) {
8533  KMP_WARNING(RedMethodNotSupported, "tree");
8534  forced_retval = critical_reduce_block;
8535  } else {
8536 #if KMP_FAST_REDUCTION_BARRIER
8537  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8538 #endif
8539  }
8540  break;
8541 
8542  default:
8543  KMP_ASSERT(0); // "unsupported method specified"
8544  }
8545 
8546  retval = forced_retval;
8547  }
8548 
8549  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8550 
8551 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8552 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8553 
8554  return (retval);
8555 }
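
/*
  [Editorial example -- not part of the runtime source] The method selection
  above is driven by ordinary reduction clauses. For a single scalar like the
  one below, a small team on x86-64/Linux would typically get
  atomic_reduce_block and a larger team the tree + reduction-barrier method;
  the forced-method setting read into __kmp_force_reduction_method can
  override the choice:

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
      double sum = 0.0;
      #pragma omp parallel for reduction(+ : sum)
      for (int i = 0; i < 1000; ++i)
        sum += 0.5 * i;
      printf("sum = %f\n", sum);
      return 0;
    }
*/
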
8556 // this function is for testing set/get/determine reduce method
8557 kmp_int32 __kmp_get_reduce_method(void) {
8558  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8559 }
8560 
8561 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8562 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8563 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8564 
8565 // Hard pause shuts down the runtime completely. Resume happens naturally when
8566 // OpenMP is used subsequently.
8567 void __kmp_hard_pause() {
8568  __kmp_pause_status = kmp_hard_paused;
8569  __kmp_internal_end_thread(-1);
8570 }
8571 
8572 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8573 void __kmp_resume_if_soft_paused() {
8574  if (__kmp_pause_status == kmp_soft_paused) {
8575  __kmp_pause_status = kmp_not_paused;
8576 
8577  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8578  kmp_info_t *thread = __kmp_threads[gtid];
8579  if (thread) { // Wake it if sleeping
8580  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8581  thread);
8582  if (fl.is_sleeping())
8583  fl.resume(gtid);
8584  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8585  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8586  } else { // thread holds the lock and may sleep soon
8587  do { // until either the thread sleeps, or we can get the lock
8588  if (fl.is_sleeping()) {
8589  fl.resume(gtid);
8590  break;
8591  } else if (__kmp_try_suspend_mx(thread)) {
8592  __kmp_unlock_suspend_mx(thread);
8593  break;
8594  }
8595  } while (1);
8596  }
8597  }
8598  }
8599  }
8600 }
8601 
8602 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8603 // TODO: add warning messages
8604 int __kmp_pause_resource(kmp_pause_status_t level) {
8605  if (level == kmp_not_paused) { // requesting resume
8606  if (__kmp_pause_status == kmp_not_paused) {
8607  // error message about runtime not being paused, so can't resume
8608  return 1;
8609  } else {
8610  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8611  __kmp_pause_status == kmp_hard_paused);
8612  __kmp_pause_status = kmp_not_paused;
8613  return 0;
8614  }
8615  } else if (level == kmp_soft_paused) { // requesting soft pause
8616  if (__kmp_pause_status != kmp_not_paused) {
8617  // error message about already being paused
8618  return 1;
8619  } else {
8620  __kmp_soft_pause();
8621  return 0;
8622  }
8623  } else if (level == kmp_hard_paused) { // requesting hard pause
8624  if (__kmp_pause_status != kmp_not_paused) {
8625  // error message about already being paused
8626  return 1;
8627  } else {
8628  __kmp_hard_pause();
8629  return 0;
8630  }
8631  } else {
8632  // error message about invalid level
8633  return 1;
8634  }
8635 }
8636 
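/*
  [Editorial example -- not part of the runtime source] These pause levels are
  reached from user code through the OpenMP 5.0 pause API. A sketch:

    #include <omp.h>
    #include <stdio.h>

    int main(void) {
      #pragma omp parallel
      { } // warm up the thread pool

      // Soft pause (maps to __kmp_soft_pause above); 0 means success.
      if (omp_pause_resource_all(omp_pause_soft) == 0)
        printf("runtime soft-paused\n");

      // Touching OpenMP again resumes the runtime automatically.
      #pragma omp parallel
      { } // workers are woken or re-created as needed
      return 0;
    }
*/
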
8637 void __kmp_omp_display_env(int verbose) {
8638  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8639  if (__kmp_init_serial == 0)
8640  __kmp_do_serial_initialize();
8641  __kmp_display_env_impl(!verbose, verbose);
8642  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8643 }
8644 
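/*
  [Editorial example -- not part of the runtime source] __kmp_omp_display_env
  backs the OMP_DISPLAY_ENV environment variable; assuming this build also
  exports the OpenMP 5.1 omp_display_env() entry point, the same listing can
  be requested from code:

    #include <omp.h>

    int main(void) {
      omp_display_env(1); // non-zero requests the verbose listing
      return 0;
    }
*/
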
8645 // Globals and functions for hidden helper task
8646 kmp_info_t **__kmp_hidden_helper_threads;
8647 kmp_info_t *__kmp_hidden_helper_main_thread;
8648 kmp_int32 __kmp_hidden_helper_threads_num = 8;
8649 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
8650 #if KMP_OS_LINUX
8651 kmp_int32 __kmp_enable_hidden_helper = TRUE;
8652 #else
8653 kmp_int32 __kmp_enable_hidden_helper = FALSE;
8654 #endif
8655 
8656 namespace {
8657 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
8658 
8659 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
8660  // This is an explicit synchronization on all hidden helper threads, in case
8661  // a regular thread pushes a hidden helper task to a hidden helper thread
8662  // that has not yet been awakened since being released by the main thread
8663  // after the team was created.
8664  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
8665  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
8666  __kmp_hidden_helper_threads_num)
8667  ;
8668 
8669  // If main thread, then wait for signal
8670  if (__kmpc_master(nullptr, *gtid)) {
8671  // First, unset the initial state and release the initial thread
8672  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
8673  __kmp_hidden_helper_initz_release();
8674  __kmp_hidden_helper_main_thread_wait();
8675  // Now wake up all worker threads
8676  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
8677  __kmp_hidden_helper_worker_thread_signal();
8678  }
8679  }
8680 }
8681 } // namespace
8682 
8683 void __kmp_hidden_helper_threads_initz_routine() {
8684  // Create a new root for hidden helper team/threads
8685  const int gtid = __kmp_register_root(TRUE);
8686  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
8687  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
8688  __kmp_hidden_helper_main_thread->th.th_set_nproc =
8689  __kmp_hidden_helper_threads_num;
8690 
8691  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
8692 
8693  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
8694 
8695  // Set the initialization flag to FALSE
8696  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
8697 
8698  __kmp_hidden_helper_threads_deinitz_release();
8699 }