aboutsummaryrefslogtreecommitdiffstats
path: root/meta/recipes-devtools/gcc/gcc-4.3.3/fedora/gcc43-libgomp-speedup.patch
diff options
context:
space:
mode:
authorRichard Purdie <rpurdie@linux.intel.com>2010-08-27 15:14:24 +0100
committerRichard Purdie <rpurdie@linux.intel.com>2010-08-27 15:29:45 +0100
commit29d6678fd546377459ef75cf54abeef5b969b5cf (patch)
tree8edd65790e37a00d01c3f203f773fe4b5012db18 /meta/recipes-devtools/gcc/gcc-4.3.3/fedora/gcc43-libgomp-speedup.patch
parentda49de6885ee1bc424e70bc02f21f6ab920efb55 (diff)
downloadopenembedded-core-contrib-29d6678fd546377459ef75cf54abeef5b969b5cf.tar.gz
Major layout change to the packages directory
Having one monolithic packages directory makes it hard to find things and is generally overwhelming. This commit splits it into several logical sections roughly based on function, recipes.txt gives more information about the classifications used. The opportunity is also used to switch from "packages" to "recipes" as used in OpenEmbedded as the term "packages" can be confusing to people and has many different meanings. Not all recipes have been classified yet, this is just a first pass at separating things out. Some packages are moved to meta-extras as they're no longer actively used or maintained. Signed-off-by: Richard Purdie <rpurdie@linux.intel.com>
Diffstat (limited to 'meta/recipes-devtools/gcc/gcc-4.3.3/fedora/gcc43-libgomp-speedup.patch')
-rw-r--r--meta/recipes-devtools/gcc/gcc-4.3.3/fedora/gcc43-libgomp-speedup.patch2797
1 files changed, 2797 insertions, 0 deletions
diff --git a/meta/recipes-devtools/gcc/gcc-4.3.3/fedora/gcc43-libgomp-speedup.patch b/meta/recipes-devtools/gcc/gcc-4.3.3/fedora/gcc43-libgomp-speedup.patch
new file mode 100644
index 0000000000..da85e556ec
--- /dev/null
+++ b/meta/recipes-devtools/gcc/gcc-4.3.3/fedora/gcc43-libgomp-speedup.patch
@@ -0,0 +1,2797 @@
+2008-03-28 Jakub Jelinek <jakub@redhat.com>
+
+ * config/linux/sparc/futex.h (atomic_write_barrier): Fix membar
+ argument.
+
+2008-03-27 Jakub Jelinek <jakub@redhat.com>
+
+ * libgomp.h (struct gomp_team_state): Remove single_count field
+ ifndef HAVE_SYNC_BUILTINS.
+ (struct gomp_team): Likewise. Add work_share_list_free_lock
+ ifndef HAVE_SYNC_BUILTINS.
+ * team.c (gomp_new_team): If HAVE_SYNC_BUILTINS is not defined,
+ don't initialize single_count, but instead initialize
+ work_share_list_free_lock.
+ (free_team): Destroy work_share_list_free_lock ifndef
+ HAVE_SYNC_BUILTINS.
+ (gomp_team_start): Don't initialize ts.single_count ifndef
+ HAVE_SYNC_BUILTINS.
+ * work.c (alloc_work_share, free_work_share): Use
+ work_share_list_free_lock instead of atomic chaining ifndef
+ HAVE_SYNC_BUILTINS.
+
+2008-03-26 Jakub Jelinek <jakub@redhat.com>
+
+ * loop.c (gomp_loop_init): Fix GFS_DYNAMIC ws->mode setting.
+ * testsuite/libgomp.c/loop-4.c: New test.
+
+ * libgomp.h (struct gomp_team_state): Add single_count field.
+ (struct gomp_team): Likewise.
+ * team.c (gomp_new_team): Clear single_count.
+ (gomp_team_start): Likewise.
+ * single.c (GOMP_single_start): Rewritten if HAVE_SYNC_BUILTINS.
+
+2008-03-25 Jakub Jelinek <jakub@redhat.com>
+
+ * team.c (gomp_thread_start): Don't clear ts.static_trip here.
+ * loop.c (gomp_loop_static_start, gomp_loop_dynamic_start): Clear
+ ts.static_trip here.
+ * work.c (gomp_work_share_start): Don't clear ts.static_trip here.
+
+2008-03-21 Jakub Jelinek <jakub@redhat.com>
+
+ * libgomp.h: Include ptrlock.h.
+ (struct gomp_work_share): Reshuffle fields. Add next_alloc,
+ next_ws, next_free and inline_ordered_team_ids fields, change
+ ordered_team_ids into pointer from flexible array member.
+ (struct gomp_team_state): Add last_work_share field, remove
+ work_share_generation.
+ (struct gomp_team): Remove work_share_lock, generation_mask,
+ oldest_live_gen, num_live_gen and init_work_shares fields, add
+ work work_share_list_alloc, work_share_list_free and work_share_chunk
+ fields. Change work_shares from pointer to pointers into an array.
+ (gomp_new_team): New prototype.
+ (gomp_team_start): Change type of last argument.
+ (gomp_new_work_share): Removed.
+ (gomp_init_work_share, gomp_fini_work_share): New prototypes.
+ (gomp_work_share_init_done): New static inline.
+ * team.c (gomp_thread_start): Clear ts.last_work_share, don't clear
+ ts.work_share_generation.
+ (new_team): Removed.
+ (gomp_new_team): New function.
+ (free_team): Free gomp_work_share blocks chained through next_alloc,
+ instead of freeing work_shares and destroying work_share_lock.
+ (gomp_team_start): Change last argument from ws to team, don't create
+ new team, set ts.work_share to &team->work_shares[0] and clear
+ ts.last_work_share. Don't clear ts.work_share_generation.
+ (gomp_team_end): Call gomp_fini_work_share.
+ * work.c (gomp_new_work_share): Removed.
+ (alloc_work_share, gomp_init_work_share, gomp_fini_work_share): New
+ functions.
+ (free_work_share): Add team argument. Call gomp_fini_work_share
+ and then either free ws if orphaned, or put it into
+ work_share_list_free list of the current team.
+ (gomp_work_share_start, gomp_work_share_end,
+ gomp_work_share_end_nowait): Rewritten.
+ * sections.c (GOMP_sections_start): Call gomp_work_share_init_done
+ after gomp_sections_init. If HAVE_SYNC_BUILTINS, call
+ gomp_iter_dynamic_next instead of the _locked variant and don't take
+ lock around it, otherwise acquire it before calling
+ gomp_iter_dynamic_next_locked.
+ (GOMP_sections_next): If HAVE_SYNC_BUILTINS, call
+ gomp_iter_dynamic_next instead of the _locked variant and don't take
+ lock around it.
+ (GOMP_parallel_sections_start): Call gomp_new_team instead of
+ gomp_new_work_share. Call gomp_sections_init on &team->work_shares[0].
+ Adjust gomp_team_start caller.
+ * loop.c (gomp_loop_static_start, gomp_loop_ordered_static_start): Call
+ gomp_work_share_init_done after gomp_loop_init. Don't unlock ws->lock.
+ (gomp_loop_dynamic_start, gomp_loop_guided_start): Call
+ gomp_work_share_init_done after gomp_loop_init. If HAVE_SYNC_BUILTINS,
+ don't unlock ws->lock, otherwise lock it.
+ (gomp_loop_ordered_dynamic_start, gomp_loop_ordered_guided_start): Call
+ gomp_work_share_init_done after gomp_loop_init. Lock ws->lock.
+ (gomp_parallel_loop_start): Call gomp_new_team instead of
+ gomp_new_work_share. Call gomp_loop_init on &team->work_shares[0].
+ Adjust gomp_team_start caller.
+ * single.c (GOMP_single_start, GOMP_single_copy_start): Call
+ gomp_work_share_init_done if gomp_work_share_start returned true.
+ Don't unlock ws->lock.
+ * parallel.c (GOMP_parallel_start): Call gomp_new_team and pass that
+ as last argument to gomp_team_start.
+ * config/linux/ptrlock.c: New file.
+ * config/linux/ptrlock.h: New file.
+ * config/posix/ptrlock.c: New file.
+ * config/posix/ptrlock.h: New file.
+ * Makefile.am (libgomp_la_SOURCES): Add ptrlock.c.
+ * Makefile.in: Regenerated.
+ * testsuite/Makefile.in: Regenerated.
+
+2008-03-19 Jakub Jelinek <jakub@redhat.com>
+
+ * libgomp.h (gomp_active_wait_policy): Remove decl.
+ (gomp_throttled_spin_count_var, gomp_available_cpus,
+ gomp_managed_threads): New extern decls.
+ * team.c (gomp_team_start, gomp_team_end): If number of threads
+ changed, adjust atomically gomp_managed_threads.
+ * env.c (gomp_active_wait_policy, gomp_block_time_var): Remove.
+ (gomp_throttled_spin_count_var, gomp_available_cpus,
+ gomp_managed_threads): New variables.
+ (parse_millis): Removed.
+ (parse_spincount): New function.
+ (parse_wait_policy): Return -1/0/1 instead of setting
+ gomp_active_wait_policy.
+ (initialize_env): Call gomp_init_num_threads unconditionally.
+ Initialize gomp_available_cpus. Call parse_spincount instead
+ of parse_millis, initialize gomp_{,throttled_}spin_count_var
+ depending on presence and value of OMP_WAIT_POLICY and
+ GOMP_SPINCOUNT env vars.
+ * config/linux/wait.h (do_wait): Use gomp_throttled_spin_count_var
+ instead of gomp_spin_count_var if gomp_managed_threads >
+ gomp_available_cpus.
+
+ * config/linux/wait.h: Include errno.h.
+ (FUTEX_WAIT, FUTEX_WAKE, FUTEX_PRIVATE_FLAG): Define.
+ (gomp_futex_wake, gomp_futex_wait): New extern decls.
+ * config/linux/mutex.c (gomp_futex_wake, gomp_futex_wait): New
+ variables.
+ * config/linux/powerpc/futex.h (FUTEX_WAIT, FUTEX_WAKE): Remove.
+ (sys_futex0): Return error code.
+ (futex_wake, futex_wait): If ENOSYS was returned, clear
+ FUTEX_PRIVATE_FLAG in gomp_futex_wa{ke,it} and retry.
+ * config/linux/alpha/futex.h (FUTEX_WAIT, FUTEX_WAKE): Remove.
+ (futex_wake, futex_wait): If ENOSYS was returned, clear
+ FUTEX_PRIVATE_FLAG in gomp_futex_wa{ke,it} and retry.
+ * config/linux/x86/futex.h (FUTEX_WAIT, FUTEX_WAKE): Remove.
+ (sys_futex0): Return error code.
+ (futex_wake, futex_wait): If ENOSYS was returned, clear
+ FUTEX_PRIVATE_FLAG in gomp_futex_wa{ke,it} and retry.
+ * config/linux/s390/futex.h (FUTEX_WAIT, FUTEX_WAKE): Remove.
+ (sys_futex0): Return error code.
+ (futex_wake, futex_wait): If ENOSYS was returned, clear
+ FUTEX_PRIVATE_FLAG in gomp_futex_wa{ke,it} and retry.
+ * config/linux/ia64/futex.h (FUTEX_WAIT, FUTEX_WAKE): Remove.
+ (sys_futex0): Return error code.
+ (futex_wake, futex_wait): If ENOSYS was returned, clear
+ FUTEX_PRIVATE_FLAG in gomp_futex_wa{ke,it} and retry.
+ * config/linux/sparc/futex.h (FUTEX_WAIT, FUTEX_WAKE): Remove.
+ (sys_futex0): Return error code.
+ (futex_wake, futex_wait): If ENOSYS was returned, clear
+ FUTEX_PRIVATE_FLAG in gomp_futex_wa{ke,it} and retry.
+
+2008-03-18 Jakub Jelinek <jakub@redhat.com>
+
+ * libgomp.h (struct gomp_work_share): Add mode field. Put lock and
+ next into a different cache line from most of the write-once fields.
+ * loop.c: Include limits.h.
+ (gomp_loop_init): For GFS_DYNAMIC, multiply ws->chunk_size by incr.
+ If adding ws->chunk_size nthreads + 1 times after end won't
+ overflow, set ws->mode to 1.
+ * iter.c (gomp_iter_dynamic_next_locked): Don't multiply
+ ws->chunk_size by incr.
+ (gomp_iter_dynamic_next): Likewise. If ws->mode, use more efficient
+ code.
+ * work.c: Include stddef.h.
+ (gomp_new_work_share): Use offsetof rather than sizeof.
+
+2008-03-17 Jakub Jelinek <jakub@redhat.com>
+
+ * libgomp.h (struct gomp_team): Change ordered_release field
+ into gomp_sem_t ** from flexible array member. Add implicit_task
+ and initial_work_shares fields.
+ (gomp_new_task): Removed.
+ (gomp_init_task): New prototype.
+ * team.c (new_team): Allocate implicit_task for each thread
+ and initial work_shares together with gomp_team allocation.
+ (free_team): Only free work_shares if it is not init_work_shares.
+ (gomp_team_start): Use gomp_init_task instead of gomp_new_task,
+ set thr->task to the corresponding implicit_task array entry.
+ * task.c (gomp_new_task): Removed.
+ (gomp_init_task): New function.
+ (gomp_end_task): Don't free the task.
+ (GOMP_task): Allocate struct gomp_task on the stack, call
+ gomp_init_task rather than gomp_new_task.
+ * work.c (gomp_work_share_start): If work_shares ==
+ init_work_shares, gomp_malloc + memcpy rather than gomp_realloc.
+
+2008-03-15 Jakub Jelinek <jakub@redhat.com>
+ Ulrich Drepper <drepper@redhat.com>
+
+ * config/linux/bar.h (gomp_barrier_state_t): Rewritten.
+ (gomp_barrier_state_t): Change to unsigned int.
+ (gomp_barrier_init, gomp_barrier_reinit, gomp_barrier_destroy,
+ gomp_barrier_wait_start, gomp_barrier_last_thread): Rewritten.
+ (gomp_barrier_wait_last): Prototype rather than inline.
+ * config/linux/bar.c (gomp_barrier_wait_end): Rewritten.
+ (gomp_barrier_wait_last): New function.
+
+2008-03-15 Jakub Jelinek <jakub@redhat.com>
+
+ * team.c (gomp_thread_start): Use gomp_barrier_wait_last instead
+ of gomp_barrier_wait.
+ * env.c (gomp_block_time_var, gomp_spin_count_var): New variables.
+ (parse_millis): New function.
+ (initialize_env): Handle GOMP_BLOCKTIME env var.
+ * libgomp.h (struct gomp_team): Move close to the end of the struct.
+ (gomp_spin_count_var): New extern var decl.
+ * work.c (gomp_work_share_end): Use gomp_barrier_state_t bstate
+ var instead of bool last, call gomp_barrier_last_thread to check
+ for last thread, pass bstate to gomp_barrier_wait_end.
+ * config/linux/wait.h: New file.
+ * config/linux/mutex.c: Include wait.h instead of libgomp.h and
+ futex.h.
+ (gomp_mutex_lock_slow): Call do_wait instead of futex_wait.
+ * config/linux/bar.c: Include wait.h instead of libgomp.h and
+ futex.h.
+ (gomp_barrier_wait_end): Change second argument to
+ gomp_barrier_state_t. Call do_wait instead of futex_wait.
+ * config/linux/sem.c: Include wait.h instead of libgomp.h and
+ futex.h.
+ (gomp_sem_wait_slow): Call do_wait instead of futex_wait.
+ * config/linux/lock.c: Include wait.h instead of libgomp.h and
+ futex.h.
+ (gomp_set_nest_lock_25): Call do_wait instead of futex_wait.
+ * config/linux/affinity.c: Assume HAVE_SYNC_BUILTINS.
+ * config/linux/bar.h (gomp_barrier_state_t): New typedef.
+ (gomp_barrier_wait_end): Change second argument to
+ gomp_barrier_state_t.
+ (gomp_barrier_wait_start): Return gomp_barrier_state_t.
+ (gomp_barrier_last_thread, gomp_barrier_wait_last): New static
+ inlines.
+ * config/linux/powerpc/futex.h (cpu_relax, atomic_write_barrier): New
+ static inlines.
+ * config/linux/alpha/futex.h (cpu_relax, atomic_write_barrier):
+ Likewise.
+ * config/linux/x86/futex.h (cpu_relax, atomic_write_barrier):
+ Likewise.
+ * config/linux/s390/futex.h (cpu_relax, atomic_write_barrier):
+ Likewise.
+ * config/linux/ia64/futex.h (cpu_relax, atomic_write_barrier):
+ Likewise.
+ * config/linux/sparc/futex.h (cpu_relax, atomic_write_barrier):
+ Likewise.
+ * config/posix/bar.c (gomp_barrier_wait_end): Change second argument
+ to gomp_barrier_state_t.
+ * config/posix/bar.h (gomp_barrier_state_t): New typedef.
+ (gomp_barrier_wait_end): Change second argument to
+ gomp_barrier_state_t.
+ (gomp_barrier_wait_start): Return gomp_barrier_state_t.
+ (gomp_barrier_last_thread, gomp_barrier_wait_last): New static
+ inlines.
+
+--- libgomp/parallel.c.jj 2007-12-07 14:41:01.000000000 +0100
++++ libgomp/parallel.c 2008-03-26 15:32:06.000000000 +0100
+@@ -68,7 +68,7 @@ void
+ GOMP_parallel_start (void (*fn) (void *), void *data, unsigned num_threads)
+ {
+ num_threads = gomp_resolve_num_threads (num_threads);
+- gomp_team_start (fn, data, num_threads, NULL);
++ gomp_team_start (fn, data, num_threads, gomp_new_team (num_threads));
+ }
+
+ void
+--- libgomp/sections.c.jj 2007-12-07 14:41:01.000000000 +0100
++++ libgomp/sections.c 2008-03-26 15:33:06.000000000 +0100
+@@ -59,14 +59,24 @@ GOMP_sections_start (unsigned count)
+ long s, e, ret;
+
+ if (gomp_work_share_start (false))
+- gomp_sections_init (thr->ts.work_share, count);
++ {
++ gomp_sections_init (thr->ts.work_share, count);
++ gomp_work_share_init_done ();
++ }
+
++#ifdef HAVE_SYNC_BUILTINS
++ if (gomp_iter_dynamic_next (&s, &e))
++ ret = s;
++ else
++ ret = 0;
++#else
++ gomp_mutex_lock (&thr->ts.work_share->lock);
+ if (gomp_iter_dynamic_next_locked (&s, &e))
+ ret = s;
+ else
+ ret = 0;
+-
+ gomp_mutex_unlock (&thr->ts.work_share->lock);
++#endif
+
+ return ret;
+ }
+@@ -83,15 +93,23 @@ GOMP_sections_start (unsigned count)
+ unsigned
+ GOMP_sections_next (void)
+ {
+- struct gomp_thread *thr = gomp_thread ();
+ long s, e, ret;
+
++#ifdef HAVE_SYNC_BUILTINS
++ if (gomp_iter_dynamic_next (&s, &e))
++ ret = s;
++ else
++ ret = 0;
++#else
++ struct gomp_thread *thr = gomp_thread ();
++
+ gomp_mutex_lock (&thr->ts.work_share->lock);
+ if (gomp_iter_dynamic_next_locked (&s, &e))
+ ret = s;
+ else
+ ret = 0;
+ gomp_mutex_unlock (&thr->ts.work_share->lock);
++#endif
+
+ return ret;
+ }
+@@ -103,15 +121,15 @@ void
+ GOMP_parallel_sections_start (void (*fn) (void *), void *data,
+ unsigned num_threads, unsigned count)
+ {
+- struct gomp_work_share *ws;
++ struct gomp_team *team;
+
+ num_threads = gomp_resolve_num_threads (num_threads);
+ if (gomp_dyn_var && num_threads > count)
+ num_threads = count;
+
+- ws = gomp_new_work_share (false, num_threads);
+- gomp_sections_init (ws, count);
+- gomp_team_start (fn, data, num_threads, ws);
++ team = gomp_new_team (num_threads);
++ gomp_sections_init (&team->work_shares[0], count);
++ gomp_team_start (fn, data, num_threads, team);
+ }
+
+ /* The GOMP_section_end* routines are called after the thread is told
+--- libgomp/env.c.jj 2007-12-07 14:41:01.000000000 +0100
++++ libgomp/env.c 2008-03-26 16:40:26.000000000 +0100
+@@ -44,6 +44,11 @@ enum gomp_schedule_type gomp_run_sched_v
+ unsigned long gomp_run_sched_chunk = 1;
+ unsigned short *gomp_cpu_affinity;
+ size_t gomp_cpu_affinity_len;
++#ifndef HAVE_SYNC_BUILTINS
++gomp_mutex_t gomp_remaining_threads_lock;
++#endif
++unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
++unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
+
+ /* Parse the OMP_SCHEDULE environment variable. */
+
+@@ -147,6 +152,79 @@ parse_unsigned_long (const char *name, u
+ return false;
+ }
+
++/* Parse the GOMP_SPINCOUNT environment varible. Return true if one was
++ present and it was successfully parsed. */
++
++static bool
++parse_spincount (const char *name, unsigned long long *pvalue)
++{
++ char *env, *end;
++ unsigned long long value, mult = 1;
++
++ env = getenv (name);
++ if (env == NULL)
++ return false;
++
++ while (isspace ((unsigned char) *env))
++ ++env;
++ if (*env == '\0')
++ goto invalid;
++
++ if (strncasecmp (env, "infinite", 8) == 0
++ || strncasecmp (env, "infinity", 8) == 0)
++ {
++ value = ~0ULL;
++ end = env + 8;
++ goto check_tail;
++ }
++
++ errno = 0;
++ value = strtoull (env, &end, 10);
++ if (errno)
++ goto invalid;
++
++ while (isspace ((unsigned char) *end))
++ ++end;
++ if (*end != '\0')
++ {
++ switch (tolower (*end))
++ {
++ case 'k':
++ mult = 1000LL;
++ break;
++ case 'm':
++ mult = 1000LL * 1000LL;
++ break;
++ case 'g':
++ mult = 1000LL * 1000LL * 1000LL;
++ break;
++ case 't':
++ mult = 1000LL * 1000LL * 1000LL * 1000LL;
++ break;
++ default:
++ goto invalid;
++ }
++ ++end;
++ check_tail:
++ while (isspace ((unsigned char) *end))
++ ++end;
++ if (*end != '\0')
++ goto invalid;
++ }
++
++ if (value > ~0ULL / mult)
++ value = ~0ULL;
++ else
++ value *= mult;
++
++ *pvalue = value;
++ return true;
++
++ invalid:
++ gomp_error ("Invalid value for environment variable %s", name);
++ return false;
++}
++
+ /* Parse a boolean value for environment variable NAME and store the
+ result in VALUE. */
+
+@@ -281,10 +359,25 @@ initialize_env (void)
+ parse_schedule ();
+ parse_boolean ("OMP_DYNAMIC", &gomp_dyn_var);
+ parse_boolean ("OMP_NESTED", &gomp_nest_var);
++ gomp_init_num_threads ();
++ gomp_available_cpus = gomp_nthreads_var;
+ if (!parse_unsigned_long ("OMP_NUM_THREADS", &gomp_nthreads_var))
+- gomp_init_num_threads ();
++ gomp_nthreads_var = gomp_available_cpus;
+ if (parse_affinity ())
+ gomp_init_affinity ();
++ if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var))
++ {
++ /* Using a rough estimation of 100000 spins per msec,
++ use 200 msec blocking.
++ Depending on the CPU speed, this can be e.g. 5 times longer
++ or 5 times shorter. */
++ gomp_spin_count_var = 20000000LL;
++ }
++ /* gomp_throttled_spin_count_var is used when there are more libgomp
++ managed threads than available CPUs. Use very short spinning. */
++ gomp_throttled_spin_count_var = 100LL;
++ if (gomp_throttled_spin_count_var > gomp_spin_count_var)
++ gomp_throttled_spin_count_var = gomp_spin_count_var;
+
+ /* Not strictly environment related, but ordering constructors is tricky. */
+ pthread_attr_init (&gomp_thread_attr);
+--- libgomp/libgomp.h.jj 2007-12-07 14:41:01.000000000 +0100
++++ libgomp/libgomp.h 2008-03-27 12:21:51.000000000 +0100
+@@ -50,6 +50,7 @@
+ #include "sem.h"
+ #include "mutex.h"
+ #include "bar.h"
++#include "ptrlock.h"
+
+
+ /* This structure contains the data to control one work-sharing construct,
+@@ -70,6 +71,8 @@ struct gomp_work_share
+ If this is a SECTIONS construct, this value will always be DYNAMIC. */
+ enum gomp_schedule_type sched;
+
++ int mode;
++
+ /* This is the chunk_size argument to the SCHEDULE clause. */
+ long chunk_size;
+
+@@ -81,17 +84,38 @@ struct gomp_work_share
+ is always 1. */
+ long incr;
+
+- /* This lock protects the update of the following members. */
+- gomp_mutex_t lock;
++ /* This is a circular queue that details which threads will be allowed
++ into the ordered region and in which order. When a thread allocates
++ iterations on which it is going to work, it also registers itself at
++ the end of the array. When a thread reaches the ordered region, it
++ checks to see if it is the one at the head of the queue. If not, it
++ blocks on its RELEASE semaphore. */
++ unsigned *ordered_team_ids;
+
+- union {
+- /* This is the next iteration value to be allocated. In the case of
+- GFS_STATIC loops, this the iteration start point and never changes. */
+- long next;
++ /* This is the number of threads that have registered themselves in
++ the circular queue ordered_team_ids. */
++ unsigned ordered_num_used;
+
+- /* This is the returned data structure for SINGLE COPYPRIVATE. */
+- void *copyprivate;
+- };
++ /* This is the team_id of the currently acknowledged owner of the ordered
++ section, or -1u if the ordered section has not been acknowledged by
++ any thread. This is distinguished from the thread that is *allowed*
++ to take the section next. */
++ unsigned ordered_owner;
++
++ /* This is the index into the circular queue ordered_team_ids of the
++ current thread that's allowed into the ordered reason. */
++ unsigned ordered_cur;
++
++ /* This is a chain of allocated gomp_work_share blocks, valid only
++ in the first gomp_work_share struct in the block. */
++ struct gomp_work_share *next_alloc;
++
++ /* The above fields are written once during workshare initialization,
++ or related to ordered worksharing. Make sure the following fields
++ are in a different cache line. */
++
++ /* This lock protects the update of the following members. */
++ gomp_mutex_t lock __attribute__((aligned (64)));
+
+ /* This is the count of the number of threads that have exited the work
+ share construct. If the construct was marked nowait, they have moved on
+@@ -99,27 +123,28 @@ struct gomp_work_share
+ of the team to exit the work share construct must deallocate it. */
+ unsigned threads_completed;
+
+- /* This is the index into the circular queue ordered_team_ids of the
+- current thread that's allowed into the ordered reason. */
+- unsigned ordered_cur;
++ union {
++ /* This is the next iteration value to be allocated. In the case of
++ GFS_STATIC loops, this the iteration start point and never changes. */
++ long next;
+
+- /* This is the number of threads that have registered themselves in
+- the circular queue ordered_team_ids. */
+- unsigned ordered_num_used;
++ /* This is the returned data structure for SINGLE COPYPRIVATE. */
++ void *copyprivate;
++ };
+
+- /* This is the team_id of the currently acknoledged owner of the ordered
+- section, or -1u if the ordered section has not been acknowledged by
+- any thread. This is distinguished from the thread that is *allowed*
+- to take the section next. */
+- unsigned ordered_owner;
++ union {
++ /* Link to gomp_work_share struct for next work sharing construct
++ encountered after this one. */
++ gomp_ptrlock_t next_ws;
++
++ /* gomp_work_share structs are chained in the free work share cache
++ through this. */
++ struct gomp_work_share *next_free;
++ };
+
+- /* This is a circular queue that details which threads will be allowed
+- into the ordered region and in which order. When a thread allocates
+- iterations on which it is going to work, it also registers itself at
+- the end of the array. When a thread reaches the ordered region, it
+- checks to see if it is the one at the head of the queue. If not, it
+- blocks on its RELEASE semaphore. */
+- unsigned ordered_team_ids[];
++ /* If only few threads are in the team, ordered_team_ids can point
++ to this array which fills the padding at the end of this struct. */
++ unsigned inline_ordered_team_ids[0];
+ };
+
+ /* This structure contains all of the thread-local data associated with
+@@ -133,21 +158,24 @@ struct gomp_team_state
+
+ /* This is the work share construct which this thread is currently
+ processing. Recall that with NOWAIT, not all threads may be
+- processing the same construct. This value is NULL when there
+- is no construct being processed. */
++ processing the same construct. */
+ struct gomp_work_share *work_share;
+
++ /* This is the previous work share construct or NULL if there wasn't any.
++ When all threads are done with the current work sharing construct,
++ the previous one can be freed. The current one can't, as its
++ next_ws field is used. */
++ struct gomp_work_share *last_work_share;
++
+ /* This is the ID of this thread within the team. This value is
+ guaranteed to be between 0 and N-1, where N is the number of
+ threads in the team. */
+ unsigned team_id;
+
+- /* The work share "generation" is a number that increases by one for
+- each work share construct encountered in the dynamic flow of the
+- program. It is used to find the control data for the work share
+- when encountering it for the first time. This particular number
+- reflects the generation of the work_share member of this struct. */
+- unsigned work_share_generation;
++#ifdef HAVE_SYNC_BUILTINS
++ /* Number of single stmts encountered. */
++ unsigned long single_count;
++#endif
+
+ /* For GFS_RUNTIME loops that resolved to GFS_STATIC, this is the
+ trip number through the loop. So first time a particular loop
+@@ -163,41 +191,53 @@ struct gomp_team_state
+
+ struct gomp_team
+ {
+- /* This lock protects access to the following work shares data structures. */
+- gomp_mutex_t work_share_lock;
+-
+- /* This is a dynamically sized array containing pointers to the control
+- structs for all "live" work share constructs. Here "live" means that
+- the construct has been encountered by at least one thread, and not
+- completed by all threads. */
+- struct gomp_work_share **work_shares;
+-
+- /* The work_shares array is indexed by "generation & generation_mask".
+- The mask will be 2**N - 1, where 2**N is the size of the array. */
+- unsigned generation_mask;
+-
+- /* These two values define the bounds of the elements of the work_shares
+- array that are currently in use. */
+- unsigned oldest_live_gen;
+- unsigned num_live_gen;
+-
+ /* This is the number of threads in the current team. */
+ unsigned nthreads;
+
++ /* This is number of gomp_work_share structs that have been allocated
++ as a block last time. */
++ unsigned work_share_chunk;
++
+ /* This is the saved team state that applied to a master thread before
+ the current thread was created. */
+ struct gomp_team_state prev_ts;
+
+- /* This barrier is used for most synchronization of the team. */
+- gomp_barrier_t barrier;
+-
+ /* This semaphore should be used by the master thread instead of its
+ "native" semaphore in the thread structure. Required for nested
+ parallels, as the master is a member of two teams. */
+ gomp_sem_t master_release;
+
+- /* This array contains pointers to the release semaphore of the threads
+- in the team. */
++ /* List of gomp_work_share structs chained through next_free fields.
++ This is populated and taken off only by the first thread in the
++ team encountering a new work sharing construct, in a critical
++ section. */
++ struct gomp_work_share *work_share_list_alloc;
++
++ /* List of gomp_work_share structs freed by free_work_share. New
++ entries are atomically added to the start of the list, and
++ alloc_work_share can safely only move all but the first entry
++ to work_share_list alloc, as free_work_share can happen concurrently
++ with alloc_work_share. */
++ struct gomp_work_share *work_share_list_free;
++
++#ifdef HAVE_SYNC_BUILTINS
++ /* Number of simple single regions encountered by threads in this
++ team. */
++ unsigned long single_count;
++#else
++ /* Mutex protecting addition of workshares to work_share_list_free. */
++ gomp_mutex_t work_share_list_free_lock;
++#endif
++
++ /* This barrier is used for most synchronization of the team. */
++ gomp_barrier_t barrier;
++
++ /* Initial work shares, to avoid allocating any gomp_work_share
++ structs in the common case. */
++ struct gomp_work_share work_shares[8];
++
++ /* This is an array with pointers to the release semaphore
++ of the threads in the team. */
+ gomp_sem_t *ordered_release[];
+ };
+
+@@ -242,6 +282,11 @@ extern bool gomp_dyn_var;
+ extern bool gomp_nest_var;
+ extern enum gomp_schedule_type gomp_run_sched_var;
+ extern unsigned long gomp_run_sched_chunk;
++#ifndef HAVE_SYNC_BUILTINS
++extern gomp_mutex_t gomp_remaining_threads_lock;
++#endif
++extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
++extern unsigned long gomp_available_cpus, gomp_managed_threads;
+
+ /* The attributes to be used during thread creation. */
+ extern pthread_attr_t gomp_thread_attr;
+@@ -306,17 +351,27 @@ extern unsigned gomp_dynamic_max_threads
+
+ /* team.c */
+
++extern struct gomp_team *gomp_new_team (unsigned);
+ extern void gomp_team_start (void (*) (void *), void *, unsigned,
+- struct gomp_work_share *);
++ struct gomp_team *);
+ extern void gomp_team_end (void);
+
+ /* work.c */
+
+-extern struct gomp_work_share * gomp_new_work_share (bool, unsigned);
++extern void gomp_init_work_share (struct gomp_work_share *, bool, unsigned);
++extern void gomp_fini_work_share (struct gomp_work_share *);
+ extern bool gomp_work_share_start (bool);
+ extern void gomp_work_share_end (void);
+ extern void gomp_work_share_end_nowait (void);
+
++static inline void
++gomp_work_share_init_done (void)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ if (__builtin_expect (thr->ts.last_work_share != NULL, 1))
++ gomp_ptrlock_set (&thr->ts.last_work_share->next_ws, thr->ts.work_share);
++}
++
+ #ifdef HAVE_ATTRIBUTE_VISIBILITY
+ # pragma GCC visibility pop
+ #endif
+--- libgomp/iter.c.jj 2008-03-26 14:48:34.000000000 +0100
++++ libgomp/iter.c 2008-03-26 15:11:23.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -154,7 +154,7 @@ gomp_iter_dynamic_next_locked (long *pst
+ if (start == ws->end)
+ return false;
+
+- chunk = ws->chunk_size * ws->incr;
++ chunk = ws->chunk_size;
+ left = ws->end - start;
+ if (ws->incr < 0)
+ {
+@@ -186,11 +186,38 @@ gomp_iter_dynamic_next (long *pstart, lo
+ struct gomp_work_share *ws = thr->ts.work_share;
+ long start, end, nend, chunk, incr;
+
+- start = ws->next;
+ end = ws->end;
+ incr = ws->incr;
+- chunk = ws->chunk_size * incr;
++ chunk = ws->chunk_size;
++
++ if (__builtin_expect (ws->mode, 1))
++ {
++ long tmp = __sync_fetch_and_add (&ws->next, chunk);
++ if (incr > 0)
++ {
++ if (tmp >= end)
++ return false;
++ nend = tmp + chunk;
++ if (nend > end)
++ nend = end;
++ *pstart = tmp;
++ *pend = nend;
++ return true;
++ }
++ else
++ {
++ if (tmp <= end)
++ return false;
++ nend = tmp + chunk;
++ if (nend < end)
++ nend = end;
++ *pstart = tmp;
++ *pend = nend;
++ return true;
++ }
++ }
+
++ start = ws->next;
+ while (1)
+ {
+ long left = end - start;
+--- libgomp/work.c.jj 2007-12-07 14:41:01.000000000 +0100
++++ libgomp/work.c 2008-03-27 12:21:51.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -29,39 +29,138 @@
+ of threads. */
+
+ #include "libgomp.h"
++#include <stddef.h>
+ #include <stdlib.h>
+ #include <string.h>
+
+
+-/* Create a new work share structure. */
++/* Allocate a new work share structure, preferably from current team's
++ free gomp_work_share cache. */
+
+-struct gomp_work_share *
+-gomp_new_work_share (bool ordered, unsigned nthreads)
++static struct gomp_work_share *
++alloc_work_share (struct gomp_team *team)
+ {
+ struct gomp_work_share *ws;
+- size_t size;
++ unsigned int i;
+
+- size = sizeof (*ws);
+- if (ordered)
+- size += nthreads * sizeof (ws->ordered_team_ids[0]);
++ /* This is called in a critical section. */
++ if (team->work_share_list_alloc != NULL)
++ {
++ ws = team->work_share_list_alloc;
++ team->work_share_list_alloc = ws->next_free;
++ return ws;
++ }
+
+- ws = gomp_malloc_cleared (size);
+- gomp_mutex_init (&ws->lock);
+- ws->ordered_owner = -1;
++#ifdef HAVE_SYNC_BUILTINS
++ ws = team->work_share_list_free;
++ /* We need atomic read from work_share_list_free,
++ as free_work_share can be called concurrently. */
++ __asm ("" : "+r" (ws));
++
++ if (ws && ws->next_free)
++ {
++ struct gomp_work_share *next = ws->next_free;
++ ws->next_free = NULL;
++ team->work_share_list_alloc = next->next_free;
++ return next;
++ }
++#else
++ gomp_mutex_lock (&team->work_share_list_free_lock);
++ ws = team->work_share_list_free;
++ if (ws)
++ {
++ team->work_share_list_alloc = ws->next_free;
++ team->work_share_list_free = NULL;
++ gomp_mutex_unlock (&team->work_share_list_free_lock);
++ return ws;
++ }
++ gomp_mutex_unlock (&team->work_share_list_free_lock);
++#endif
+
++ team->work_share_chunk *= 2;
++ ws = gomp_malloc (team->work_share_chunk * sizeof (struct gomp_work_share));
++ ws->next_alloc = team->work_shares[0].next_alloc;
++ team->work_shares[0].next_alloc = ws;
++ team->work_share_list_alloc = &ws[1];
++ for (i = 1; i < team->work_share_chunk - 1; i++)
++ ws[i].next_free = &ws[i + 1];
++ ws[i].next_free = NULL;
+ return ws;
+ }
+
++/* Initialize an already allocated struct gomp_work_share.
++ This shouldn't touch the next_alloc field. */
++
++void
++gomp_init_work_share (struct gomp_work_share *ws, bool ordered,
++ unsigned nthreads)
++{
++ gomp_mutex_init (&ws->lock);
++ if (__builtin_expect (ordered, 0))
++ {
++#define INLINE_ORDERED_TEAM_IDS_CNT \
++ ((sizeof (struct gomp_work_share) \
++ - offsetof (struct gomp_work_share, inline_ordered_team_ids)) \
++ / sizeof (((struct gomp_work_share *) 0)->inline_ordered_team_ids[0]))
++
++ if (nthreads > INLINE_ORDERED_TEAM_IDS_CNT)
++ ws->ordered_team_ids
++ = gomp_malloc (nthreads * sizeof (*ws->ordered_team_ids));
++ else
++ ws->ordered_team_ids = ws->inline_ordered_team_ids;
++ memset (ws->ordered_team_ids, '\0',
++ nthreads * sizeof (*ws->ordered_team_ids));
++ ws->ordered_num_used = 0;
++ ws->ordered_owner = -1;
++ ws->ordered_cur = 0;
++ }
++ else
++ ws->ordered_team_ids = NULL;
++ gomp_ptrlock_init (&ws->next_ws, NULL);
++ ws->threads_completed = 0;
++}
+
+-/* Free a work share structure. */
++/* Do any needed destruction of gomp_work_share fields before it
++ is put back into free gomp_work_share cache or freed. */
+
+-static void
+-free_work_share (struct gomp_work_share *ws)
++void
++gomp_fini_work_share (struct gomp_work_share *ws)
+ {
+ gomp_mutex_destroy (&ws->lock);
+- free (ws);
++ if (ws->ordered_team_ids != ws->inline_ordered_team_ids)
++ free (ws->ordered_team_ids);
++ gomp_ptrlock_destroy (&ws->next_ws);
+ }
+
++/* Free a work share struct, if not orphaned, put it into current
++ team's free gomp_work_share cache. */
++
++static inline void
++free_work_share (struct gomp_team *team, struct gomp_work_share *ws)
++{
++ gomp_fini_work_share (ws);
++ if (__builtin_expect (team == NULL, 0))
++ free (ws);
++ else
++ {
++ struct gomp_work_share *next_ws;
++#ifdef HAVE_SYNC_BUILTINS
++ do
++ {
++ next_ws = team->work_share_list_free;
++ ws->next_free = next_ws;
++ }
++ while (!__sync_bool_compare_and_swap (&team->work_share_list_free,
++ next_ws, ws));
++#else
++ gomp_mutex_lock (&team->work_share_list_free_lock);
++ next_ws = team->work_share_list_free;
++ ws->next_free = next_ws;
++ team->work_share_list_free = ws;
++ gomp_mutex_unlock (&team->work_share_list_free_lock);
++#endif
++ }
++}
+
+ /* The current thread is ready to begin the next work sharing construct.
+ In all cases, thr->ts.work_share is updated to point to the new
+@@ -74,71 +173,34 @@ gomp_work_share_start (bool ordered)
+ struct gomp_thread *thr = gomp_thread ();
+ struct gomp_team *team = thr->ts.team;
+ struct gomp_work_share *ws;
+- unsigned ws_index, ws_gen;
+
+ /* Work sharing constructs can be orphaned. */
+ if (team == NULL)
+ {
+- ws = gomp_new_work_share (ordered, 1);
++ ws = gomp_malloc (sizeof (*ws));
++ gomp_init_work_share (ws, ordered, 1);
+ thr->ts.work_share = ws;
+- thr->ts.static_trip = 0;
+- gomp_mutex_lock (&ws->lock);
+- return true;
++ return ws;
+ }
+
+- gomp_mutex_lock (&team->work_share_lock);
+-
+- /* This thread is beginning its next generation. */
+- ws_gen = ++thr->ts.work_share_generation;
+-
+- /* If this next generation is not newer than any other generation in
+- the team, then simply reference the existing construct. */
+- if (ws_gen - team->oldest_live_gen < team->num_live_gen)
++ ws = thr->ts.work_share;
++ thr->ts.last_work_share = ws;
++ ws = gomp_ptrlock_get (&ws->next_ws);
++ if (ws == NULL)
+ {
+- ws_index = ws_gen & team->generation_mask;
+- ws = team->work_shares[ws_index];
++ /* This thread encountered a new ws first. */
++ struct gomp_work_share *ws = alloc_work_share (team);
++ gomp_init_work_share (ws, ordered, team->nthreads);
+ thr->ts.work_share = ws;
+- thr->ts.static_trip = 0;
+-
+- gomp_mutex_lock (&ws->lock);
+- gomp_mutex_unlock (&team->work_share_lock);
+-
+- return false;
++ return true;
+ }
+-
+- /* Resize the work shares queue if we've run out of space. */
+- if (team->num_live_gen++ == team->generation_mask)
++ else
+ {
+- team->work_shares = gomp_realloc (team->work_shares,
+- 2 * team->num_live_gen
+- * sizeof (*team->work_shares));
+-
+- /* Unless oldest_live_gen is zero, the sequence of live elements
+- wraps around the end of the array. If we do nothing, we break
+- lookup of the existing elements. Fix that by unwrapping the
+- data from the front to the end. */
+- if (team->oldest_live_gen > 0)
+- memcpy (team->work_shares + team->num_live_gen,
+- team->work_shares,
+- (team->oldest_live_gen & team->generation_mask)
+- * sizeof (*team->work_shares));
+-
+- team->generation_mask = team->generation_mask * 2 + 1;
+- }
+-
+- ws_index = ws_gen & team->generation_mask;
+- ws = gomp_new_work_share (ordered, team->nthreads);
+- thr->ts.work_share = ws;
+- thr->ts.static_trip = 0;
+- team->work_shares[ws_index] = ws;
+-
+- gomp_mutex_lock (&ws->lock);
+- gomp_mutex_unlock (&team->work_share_lock);
+-
+- return true;
++ thr->ts.work_share = ws;
++ return false;
++ }
+ }
+
+-
+ /* The current thread is done with its current work sharing construct.
+ This version does imply a barrier at the end of the work-share. */
+
+@@ -147,36 +209,28 @@ gomp_work_share_end (void)
+ {
+ struct gomp_thread *thr = gomp_thread ();
+ struct gomp_team *team = thr->ts.team;
+- struct gomp_work_share *ws = thr->ts.work_share;
+- bool last;
+-
+- thr->ts.work_share = NULL;
++ gomp_barrier_state_t bstate;
+
+ /* Work sharing constructs can be orphaned. */
+ if (team == NULL)
+ {
+- free_work_share (ws);
++ free_work_share (NULL, thr->ts.work_share);
++ thr->ts.work_share = NULL;
+ return;
+ }
+
+- last = gomp_barrier_wait_start (&team->barrier);
++ bstate = gomp_barrier_wait_start (&team->barrier);
+
+- if (last)
++ if (gomp_barrier_last_thread (bstate))
+ {
+- unsigned ws_index;
+-
+- ws_index = thr->ts.work_share_generation & team->generation_mask;
+- team->work_shares[ws_index] = NULL;
+- team->oldest_live_gen++;
+- team->num_live_gen = 0;
+-
+- free_work_share (ws);
++ if (__builtin_expect (thr->ts.last_work_share != NULL, 1))
++ free_work_share (team, thr->ts.last_work_share);
+ }
+
+- gomp_barrier_wait_end (&team->barrier, last);
++ gomp_barrier_wait_end (&team->barrier, bstate);
++ thr->ts.last_work_share = NULL;
+ }
+
+-
+ /* The current thread is done with its current work sharing construct.
+ This version does NOT imply a barrier at the end of the work-share. */
+
+@@ -188,15 +242,17 @@ gomp_work_share_end_nowait (void)
+ struct gomp_work_share *ws = thr->ts.work_share;
+ unsigned completed;
+
+- thr->ts.work_share = NULL;
+-
+ /* Work sharing constructs can be orphaned. */
+ if (team == NULL)
+ {
+- free_work_share (ws);
++ free_work_share (NULL, ws);
++ thr->ts.work_share = NULL;
+ return;
+ }
+
++ if (__builtin_expect (thr->ts.last_work_share == NULL, 0))
++ return;
++
+ #ifdef HAVE_SYNC_BUILTINS
+ completed = __sync_add_and_fetch (&ws->threads_completed, 1);
+ #else
+@@ -206,18 +262,6 @@ gomp_work_share_end_nowait (void)
+ #endif
+
+ if (completed == team->nthreads)
+- {
+- unsigned ws_index;
+-
+- gomp_mutex_lock (&team->work_share_lock);
+-
+- ws_index = thr->ts.work_share_generation & team->generation_mask;
+- team->work_shares[ws_index] = NULL;
+- team->oldest_live_gen++;
+- team->num_live_gen--;
+-
+- gomp_mutex_unlock (&team->work_share_lock);
+-
+- free_work_share (ws);
+- }
++ free_work_share (team, thr->ts.last_work_share);
++ thr->ts.last_work_share = NULL;
+ }
+--- libgomp/single.c.jj 2007-12-07 14:41:01.000000000 +0100
++++ libgomp/single.c 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -37,10 +37,24 @@
+ bool
+ GOMP_single_start (void)
+ {
++#ifdef HAVE_SYNC_BUILTINS
++ struct gomp_thread *thr = gomp_thread ();
++ struct gomp_team *team = thr->ts.team;
++ unsigned long single_count;
++
++ if (__builtin_expect (team == NULL, 0))
++ return true;
++
++ single_count = thr->ts.single_count++;
++ return __sync_bool_compare_and_swap (&team->single_count, single_count,
++ single_count + 1L);
++#else
+ bool ret = gomp_work_share_start (false);
+- gomp_mutex_unlock (&gomp_thread ()->ts.work_share->lock);
++ if (ret)
++ gomp_work_share_init_done ();
+ gomp_work_share_end_nowait ();
+ return ret;
++#endif
+ }
+
+ /* This routine is called when first encountering a SINGLE construct that
+@@ -57,10 +71,12 @@ GOMP_single_copy_start (void)
+ void *ret;
+
+ first = gomp_work_share_start (false);
+- gomp_mutex_unlock (&thr->ts.work_share->lock);
+
+ if (first)
+- ret = NULL;
++ {
++ gomp_work_share_init_done ();
++ ret = NULL;
++ }
+ else
+ {
+ gomp_barrier_wait (&thr->ts.team->barrier);
+--- libgomp/loop.c.jj 2007-12-07 14:41:01.000000000 +0100
++++ libgomp/loop.c 2008-03-26 18:47:04.000000000 +0100
+@@ -27,8 +27,9 @@
+
+ /* This file handles the LOOP (FOR/DO) construct. */
+
+-#include "libgomp.h"
++#include <limits.h>
+ #include <stdlib.h>
++#include "libgomp.h"
+
+
+ /* Initialize the given work share construct from the given arguments. */
+@@ -44,6 +45,39 @@ gomp_loop_init (struct gomp_work_share *
+ ? start : end;
+ ws->incr = incr;
+ ws->next = start;
++ if (sched == GFS_DYNAMIC)
++ {
++ ws->chunk_size *= incr;
++
++#ifdef HAVE_SYNC_BUILTINS
++ {
++ /* For dynamic scheduling prepare things to make each iteration
++ faster. */
++ struct gomp_thread *thr = gomp_thread ();
++ struct gomp_team *team = thr->ts.team;
++ long nthreads = team ? team->nthreads : 1;
++
++ if (__builtin_expect (incr > 0, 1))
++ {
++ /* Cheap overflow protection. */
++ if (__builtin_expect ((nthreads | ws->chunk_size)
++ >= 1UL << (sizeof (long)
++ * __CHAR_BIT__ / 2 - 1), 0))
++ ws->mode = 0;
++ else
++ ws->mode = ws->end < (LONG_MAX
++ - (nthreads + 1) * ws->chunk_size);
++ }
++ /* Cheap overflow protection. */
++ else if (__builtin_expect ((nthreads | -ws->chunk_size)
++ >= 1UL << (sizeof (long)
++ * __CHAR_BIT__ / 2 - 1), 0))
++ ws->mode = 0;
++ else
++ ws->mode = ws->end > (nthreads + 1) * -ws->chunk_size - LONG_MAX;
++ }
++#endif
++ }
+ }
+
+ /* The *_start routines are called when first encountering a loop construct
+@@ -68,10 +102,13 @@ gomp_loop_static_start (long start, long
+ {
+ struct gomp_thread *thr = gomp_thread ();
+
++ thr->ts.static_trip = 0;
+ if (gomp_work_share_start (false))
+- gomp_loop_init (thr->ts.work_share, start, end, incr,
+- GFS_STATIC, chunk_size);
+- gomp_mutex_unlock (&thr->ts.work_share->lock);
++ {
++ gomp_loop_init (thr->ts.work_share, start, end, incr,
++ GFS_STATIC, chunk_size);
++ gomp_work_share_init_done ();
++ }
+
+ return !gomp_iter_static_next (istart, iend);
+ }
+@@ -84,13 +121,16 @@ gomp_loop_dynamic_start (long start, lon
+ bool ret;
+
+ if (gomp_work_share_start (false))
+- gomp_loop_init (thr->ts.work_share, start, end, incr,
+- GFS_DYNAMIC, chunk_size);
++ {
++ gomp_loop_init (thr->ts.work_share, start, end, incr,
++ GFS_DYNAMIC, chunk_size);
++ gomp_work_share_init_done ();
++ }
+
+ #ifdef HAVE_SYNC_BUILTINS
+- gomp_mutex_unlock (&thr->ts.work_share->lock);
+ ret = gomp_iter_dynamic_next (istart, iend);
+ #else
++ gomp_mutex_lock (&thr->ts.work_share->lock);
+ ret = gomp_iter_dynamic_next_locked (istart, iend);
+ gomp_mutex_unlock (&thr->ts.work_share->lock);
+ #endif
+@@ -106,13 +146,16 @@ gomp_loop_guided_start (long start, long
+ bool ret;
+
+ if (gomp_work_share_start (false))
+- gomp_loop_init (thr->ts.work_share, start, end, incr,
+- GFS_GUIDED, chunk_size);
++ {
++ gomp_loop_init (thr->ts.work_share, start, end, incr,
++ GFS_GUIDED, chunk_size);
++ gomp_work_share_init_done ();
++ }
+
+ #ifdef HAVE_SYNC_BUILTINS
+- gomp_mutex_unlock (&thr->ts.work_share->lock);
+ ret = gomp_iter_guided_next (istart, iend);
+ #else
++ gomp_mutex_lock (&thr->ts.work_share->lock);
+ ret = gomp_iter_guided_next_locked (istart, iend);
+ gomp_mutex_unlock (&thr->ts.work_share->lock);
+ #endif
+@@ -149,13 +192,14 @@ gomp_loop_ordered_static_start (long sta
+ {
+ struct gomp_thread *thr = gomp_thread ();
+
++ thr->ts.static_trip = 0;
+ if (gomp_work_share_start (true))
+ {
+ gomp_loop_init (thr->ts.work_share, start, end, incr,
+ GFS_STATIC, chunk_size);
+ gomp_ordered_static_init ();
++ gomp_work_share_init_done ();
+ }
+- gomp_mutex_unlock (&thr->ts.work_share->lock);
+
+ return !gomp_iter_static_next (istart, iend);
+ }
+@@ -168,8 +212,14 @@ gomp_loop_ordered_dynamic_start (long st
+ bool ret;
+
+ if (gomp_work_share_start (true))
+- gomp_loop_init (thr->ts.work_share, start, end, incr,
+- GFS_DYNAMIC, chunk_size);
++ {
++ gomp_loop_init (thr->ts.work_share, start, end, incr,
++ GFS_DYNAMIC, chunk_size);
++ gomp_mutex_lock (&thr->ts.work_share->lock);
++ gomp_work_share_init_done ();
++ }
++ else
++ gomp_mutex_lock (&thr->ts.work_share->lock);
+
+ ret = gomp_iter_dynamic_next_locked (istart, iend);
+ if (ret)
+@@ -187,8 +237,14 @@ gomp_loop_ordered_guided_start (long sta
+ bool ret;
+
+ if (gomp_work_share_start (true))
+- gomp_loop_init (thr->ts.work_share, start, end, incr,
+- GFS_GUIDED, chunk_size);
++ {
++ gomp_loop_init (thr->ts.work_share, start, end, incr,
++ GFS_GUIDED, chunk_size);
++ gomp_mutex_lock (&thr->ts.work_share->lock);
++ gomp_work_share_init_done ();
++ }
++ else
++ gomp_mutex_lock (&thr->ts.work_share->lock);
+
+ ret = gomp_iter_guided_next_locked (istart, iend);
+ if (ret)
+@@ -375,12 +431,12 @@ gomp_parallel_loop_start (void (*fn) (vo
+ long incr, enum gomp_schedule_type sched,
+ long chunk_size)
+ {
+- struct gomp_work_share *ws;
++ struct gomp_team *team;
+
+ num_threads = gomp_resolve_num_threads (num_threads);
+- ws = gomp_new_work_share (false, num_threads);
+- gomp_loop_init (ws, start, end, incr, sched, chunk_size);
+- gomp_team_start (fn, data, num_threads, ws);
++ team = gomp_new_team (num_threads);
++ gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size);
++ gomp_team_start (fn, data, num_threads, team);
+ }
+
+ void
+--- libgomp/Makefile.in.jj 2008-01-10 20:53:47.000000000 +0100
++++ libgomp/Makefile.in 2008-03-26 18:51:01.000000000 +0100
+@@ -83,7 +83,7 @@ libgomp_la_LIBADD =
+ am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo env.lo \
+ error.lo iter.lo loop.lo ordered.lo parallel.lo sections.lo \
+ single.lo team.lo work.lo lock.lo mutex.lo proc.lo sem.lo \
+- bar.lo time.lo fortran.lo affinity.lo
++ bar.lo ptrlock.lo time.lo fortran.lo affinity.lo
+ libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
+ DEFAULT_INCLUDES = -I. -I$(srcdir) -I.
+ depcomp = $(SHELL) $(top_srcdir)/../depcomp
+@@ -292,7 +292,7 @@ libgomp_version_info = -version-info $(l
+ libgomp_la_LDFLAGS = $(libgomp_version_info) $(libgomp_version_script)
+ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
+ loop.c ordered.c parallel.c sections.c single.c team.c work.c \
+- lock.c mutex.c proc.c sem.c bar.c time.c fortran.c affinity.c
++ lock.c mutex.c proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c
+
+ nodist_noinst_HEADERS = libgomp_f.h
+ nodist_libsubinclude_HEADERS = omp.h
+@@ -434,6 +434,7 @@ distclean-compile:
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/proc.Plo@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ptrlock.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sections.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sem.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/single.Plo@am__quote@
+--- libgomp/testsuite/libgomp.c/loop-4.c.jj 2008-03-26 18:47:04.000000000 +0100
++++ libgomp/testsuite/libgomp.c/loop-4.c 2008-03-26 18:47:04.000000000 +0100
+@@ -0,0 +1,28 @@
++/* { dg-do run } */
++
++extern void abort (void);
++
++int
++main (void)
++{
++ int e = 0;
++#pragma omp parallel num_threads (4) reduction(+:e)
++ {
++ long i;
++ #pragma omp for schedule(dynamic,1)
++ for (i = __LONG_MAX__ - 30001; i <= __LONG_MAX__ - 10001; i += 10000)
++ if (i != __LONG_MAX__ - 30001
++ && i != __LONG_MAX__ - 20001
++ && i != __LONG_MAX__ - 10001)
++ e = 1;
++ #pragma omp for schedule(dynamic,1)
++ for (i = -__LONG_MAX__ + 30000; i >= -__LONG_MAX__ + 10000; i -= 10000)
++ if (i != -__LONG_MAX__ + 30000
++ && i != -__LONG_MAX__ + 20000
++ && i != -__LONG_MAX__ + 10000)
++ e = 1;
++ }
++ if (e)
++ abort ();
++ return 0;
++}
+--- libgomp/Makefile.am.jj 2007-12-07 14:41:01.000000000 +0100
++++ libgomp/Makefile.am 2008-03-26 15:15:19.000000000 +0100
+@@ -31,7 +31,7 @@ libgomp_la_LDFLAGS = $(libgomp_version_i
+
+ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
+ loop.c ordered.c parallel.c sections.c single.c team.c work.c \
+- lock.c mutex.c proc.c sem.c bar.c time.c fortran.c affinity.c
++ lock.c mutex.c proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c
+
+ nodist_noinst_HEADERS = libgomp_f.h
+ nodist_libsubinclude_HEADERS = omp.h
+--- libgomp/team.c.jj 2007-12-07 14:41:01.000000000 +0100
++++ libgomp/team.c 2008-03-27 12:22:26.000000000 +0100
+@@ -94,7 +94,7 @@ gomp_thread_start (void *xdata)
+ {
+ gomp_barrier_wait (&thr->ts.team->barrier);
+ local_fn (local_data);
+- gomp_barrier_wait (&thr->ts.team->barrier);
++ gomp_barrier_wait_last (&thr->ts.team->barrier);
+ }
+ else
+ {
+@@ -114,11 +114,10 @@ gomp_thread_start (void *xdata)
+ thr->data = NULL;
+ thr->ts.team = NULL;
+ thr->ts.work_share = NULL;
++ thr->ts.last_work_share = NULL;
+ thr->ts.team_id = 0;
+- thr->ts.work_share_generation = 0;
+- thr->ts.static_trip = 0;
+
+- gomp_barrier_wait (&team->barrier);
++ gomp_barrier_wait_last (&team->barrier);
+ gomp_barrier_wait (&gomp_threads_dock);
+
+ local_fn = thr->fn;
+@@ -133,21 +132,29 @@ gomp_thread_start (void *xdata)
+
+ /* Create a new team data structure. */
+
+-static struct gomp_team *
+-new_team (unsigned nthreads, struct gomp_work_share *work_share)
++struct gomp_team *
++gomp_new_team (unsigned nthreads)
+ {
+ struct gomp_team *team;
+ size_t size;
++ int i;
+
+ size = sizeof (*team) + nthreads * sizeof (team->ordered_release[0]);
+ team = gomp_malloc (size);
+- gomp_mutex_init (&team->work_share_lock);
+
+- team->work_shares = gomp_malloc (4 * sizeof (struct gomp_work_share *));
+- team->generation_mask = 3;
+- team->oldest_live_gen = work_share == NULL;
+- team->num_live_gen = work_share != NULL;
+- team->work_shares[0] = work_share;
++ team->work_share_chunk = 8;
++#ifdef HAVE_SYNC_BUILTINS
++ team->single_count = 0;
++#else
++ gomp_mutex_init (&team->work_share_list_free_lock);
++#endif
++ gomp_init_work_share (&team->work_shares[0], false, nthreads);
++ team->work_shares[0].next_alloc = NULL;
++ team->work_share_list_free = NULL;
++ team->work_share_list_alloc = &team->work_shares[1];
++ for (i = 1; i < 7; i++)
++ team->work_shares[i].next_free = &team->work_shares[i + 1];
++ team->work_shares[i].next_free = NULL;
+
+ team->nthreads = nthreads;
+ gomp_barrier_init (&team->barrier, nthreads);
+@@ -164,10 +171,22 @@ new_team (unsigned nthreads, struct gomp
+ static void
+ free_team (struct gomp_team *team)
+ {
+- free (team->work_shares);
+- gomp_mutex_destroy (&team->work_share_lock);
++ if (__builtin_expect (team->work_shares[0].next_alloc != NULL, 0))
++ {
++ struct gomp_work_share *ws = team->work_shares[0].next_alloc;
++ do
++ {
++ struct gomp_work_share *next_ws = ws->next_alloc;
++ free (ws);
++ ws = next_ws;
++ }
++ while (ws != NULL);
++ }
+ gomp_barrier_destroy (&team->barrier);
+ gomp_sem_destroy (&team->master_release);
++#ifndef HAVE_SYNC_BUILTINS
++ gomp_mutex_destroy (&team->work_share_list_free_lock);
++#endif
+ free (team);
+ }
+
+@@ -176,11 +195,10 @@ free_team (struct gomp_team *team)
+
+ void
+ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
+- struct gomp_work_share *work_share)
++ struct gomp_team *team)
+ {
+ struct gomp_thread_start_data *start_data;
+ struct gomp_thread *thr, *nthr;
+- struct gomp_team *team;
+ bool nested;
+ unsigned i, n, old_threads_used = 0;
+ pthread_attr_t thread_attr, *attr;
+@@ -188,17 +206,18 @@ gomp_team_start (void (*fn) (void *), vo
+ thr = gomp_thread ();
+ nested = thr->ts.team != NULL;
+
+- team = new_team (nthreads, work_share);
+-
+ /* Always save the previous state, even if this isn't a nested team.
+ In particular, we should save any work share state from an outer
+ orphaned work share construct. */
+ team->prev_ts = thr->ts;
+
+ thr->ts.team = team;
+- thr->ts.work_share = work_share;
+ thr->ts.team_id = 0;
+- thr->ts.work_share_generation = 0;
++ thr->ts.work_share = &team->work_shares[0];
++ thr->ts.last_work_share = NULL;
++#ifdef HAVE_SYNC_BUILTINS
++ thr->ts.single_count = 0;
++#endif
+ thr->ts.static_trip = 0;
+
+ if (nthreads == 1)
+@@ -241,9 +260,12 @@ gomp_team_start (void (*fn) (void *), vo
+ {
+ nthr = gomp_threads[i];
+ nthr->ts.team = team;
+- nthr->ts.work_share = work_share;
++ nthr->ts.work_share = &team->work_shares[0];
++ nthr->ts.last_work_share = NULL;
+ nthr->ts.team_id = i;
+- nthr->ts.work_share_generation = 0;
++#ifdef HAVE_SYNC_BUILTINS
++ nthr->ts.single_count = 0;
++#endif
+ nthr->ts.static_trip = 0;
+ nthr->fn = fn;
+ nthr->data = data;
+@@ -266,8 +288,24 @@ gomp_team_start (void (*fn) (void *), vo
+ }
+ }
+
++ if (__builtin_expect (nthreads > old_threads_used, 0))
++ {
++ long diff = (long) nthreads - (long) old_threads_used;
++
++ if (old_threads_used == 0)
++ --diff;
++
++#ifdef HAVE_SYNC_BUILTINS
++ __sync_fetch_and_add (&gomp_managed_threads, diff);
++#else
++ gomp_mutex_lock (&gomp_remaining_threads_lock);
++ gomp_managed_threads += diff;
++ gomp_mutex_unlock (&gomp_remaining_threads_lock);
++#endif
++ }
++
+ attr = &gomp_thread_attr;
+- if (gomp_cpu_affinity != NULL)
++ if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
+ {
+ size_t stacksize;
+ pthread_attr_init (&thread_attr);
+@@ -287,9 +325,12 @@ gomp_team_start (void (*fn) (void *), vo
+ int err;
+
+ start_data->ts.team = team;
+- start_data->ts.work_share = work_share;
++ start_data->ts.work_share = &team->work_shares[0];
++ start_data->ts.last_work_share = NULL;
+ start_data->ts.team_id = i;
+- start_data->ts.work_share_generation = 0;
++#ifdef HAVE_SYNC_BUILTINS
++ start_data->ts.single_count = 0;
++#endif
+ start_data->ts.static_trip = 0;
+ start_data->fn = fn;
+ start_data->fn_data = data;
+@@ -303,7 +344,7 @@ gomp_team_start (void (*fn) (void *), vo
+ gomp_fatal ("Thread creation failed: %s", strerror (err));
+ }
+
+- if (gomp_cpu_affinity != NULL)
++ if (__builtin_expect (gomp_cpu_affinity != NULL, 0))
+ pthread_attr_destroy (&thread_attr);
+
+ do_release:
+@@ -313,8 +354,20 @@ gomp_team_start (void (*fn) (void *), vo
+ that should arrive back at the end of this team. The extra
+ threads should be exiting. Note that we arrange for this test
+ to never be true for nested teams. */
+- if (nthreads < old_threads_used)
+- gomp_barrier_reinit (&gomp_threads_dock, nthreads);
++ if (__builtin_expect (nthreads < old_threads_used, 0))
++ {
++ long diff = (long) nthreads - (long) old_threads_used;
++
++ gomp_barrier_reinit (&gomp_threads_dock, nthreads);
++
++#ifdef HAVE_SYNC_BUILTINS
++ __sync_fetch_and_add (&gomp_managed_threads, diff);
++#else
++ gomp_mutex_lock (&gomp_remaining_threads_lock);
++ gomp_managed_threads += diff;
++ gomp_mutex_unlock (&gomp_remaining_threads_lock);
++#endif
++ }
+ }
+
+
+@@ -329,8 +382,21 @@ gomp_team_end (void)
+
+ gomp_barrier_wait (&team->barrier);
+
++ gomp_fini_work_share (thr->ts.work_share);
++
+ thr->ts = team->prev_ts;
+
++ if (__builtin_expect (thr->ts.team != NULL, 0))
++ {
++#ifdef HAVE_SYNC_BUILTINS
++ __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
++#else
++ gomp_mutex_lock (&gomp_remaining_threads_lock);
++ gomp_managed_threads -= team->nthreads - 1L;
++ gomp_mutex_unlock (&gomp_remaining_threads_lock);
++#endif
++ }
++
+ free_team (team);
+ }
+
+--- libgomp/config/posix/bar.h.jj 2007-12-07 14:41:01.000000000 +0100
++++ libgomp/config/posix/bar.h 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -46,18 +46,32 @@ typedef struct
+ unsigned total;
+ unsigned arrived;
+ } gomp_barrier_t;
++typedef bool gomp_barrier_state_t;
+
+ extern void gomp_barrier_init (gomp_barrier_t *, unsigned);
+ extern void gomp_barrier_reinit (gomp_barrier_t *, unsigned);
+ extern void gomp_barrier_destroy (gomp_barrier_t *);
+
+ extern void gomp_barrier_wait (gomp_barrier_t *);
+-extern void gomp_barrier_wait_end (gomp_barrier_t *, bool);
++extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t);
+
+-static inline bool gomp_barrier_wait_start (gomp_barrier_t *bar)
++static inline gomp_barrier_state_t
++gomp_barrier_wait_start (gomp_barrier_t *bar)
+ {
+ gomp_mutex_lock (&bar->mutex1);
+ return ++bar->arrived == bar->total;
+ }
+
++static inline bool
++gomp_barrier_last_thread (gomp_barrier_state_t state)
++{
++ return state;
++}
++
++static inline void
++gomp_barrier_wait_last (gomp_barrier_t *bar)
++{
++ gomp_barrier_wait (bar);
++}
++
+ #endif /* GOMP_BARRIER_H */
+--- libgomp/config/posix/ptrlock.h.jj 2008-03-26 15:11:32.000000000 +0100
++++ libgomp/config/posix/ptrlock.h 2008-03-26 15:11:32.000000000 +0100
+@@ -0,0 +1,69 @@
++/* Copyright (C) 2008 Free Software Foundation, Inc.
++ Contributed by Jakub Jelinek <jakub@redhat.com>.
++
++ This file is part of the GNU OpenMP Library (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU Lesser General Public License as published by
++ the Free Software Foundation; either version 2.1 of the License, or
++ (at your option) any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
++ more details.
++
++ You should have received a copy of the GNU Lesser General Public License
++ along with libgomp; see the file COPYING.LIB. If not, write to the
++ Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
++ MA 02110-1301, USA. */
++
++/* As a special exception, if you link this library with other files, some
++ of which are compiled with GCC, to produce an executable, this library
++ does not by itself cause the resulting executable to be covered by the
++ GNU General Public License. This exception does not however invalidate
++ any other reasons why the executable file might be covered by the GNU
++ General Public License. */
++
++/* This is a Linux specific implementation of a mutex synchronization
++ mechanism for libgomp. This type is private to the library. This
++ implementation uses atomic instructions and the futex syscall. */
++
++#ifndef GOMP_PTRLOCK_H
++#define GOMP_PTRLOCK_H 1
++
++typedef struct { void *ptr; gomp_mutex_t lock; } gomp_ptrlock_t;
++
++static inline void gomp_ptrlock_init (gomp_ptrlock_t *ptrlock, void *ptr)
++{
++ ptrlock->ptr = ptr;
++ gomp_mutex_init (&ptrlock->lock);
++}
++
++static inline void *gomp_ptrlock_get (gomp_ptrlock_t *ptrlock)
++{
++ if (ptrlock->ptr != NULL)
++ return ptrlock->ptr;
++
++ gomp_mutex_lock (&ptrlock->lock);
++ if (ptrlock->ptr != NULL)
++ {
++ gomp_mutex_unlock (&ptrlock->lock);
++ return ptrlock->ptr;
++ }
++
++ return NULL;
++}
++
++static inline void gomp_ptrlock_set (gomp_ptrlock_t *ptrlock, void *ptr)
++{
++ ptrlock->ptr = ptr;
++ gomp_mutex_unlock (&ptrlock->lock);
++}
++
++static inline void gomp_ptrlock_destroy (gomp_ptrlock_t *ptrlock)
++{
++ gomp_mutex_destroy (&ptrlock->lock);
++}
++
++#endif /* GOMP_PTRLOCK_H */
+--- libgomp/config/posix/ptrlock.c.jj 2008-03-26 15:11:32.000000000 +0100
++++ libgomp/config/posix/ptrlock.c 2008-03-26 15:11:32.000000000 +0100
+@@ -0,0 +1 @@
++/* Everything is in the header. */
+--- libgomp/config/posix/bar.c.jj 2007-12-07 14:41:01.000000000 +0100
++++ libgomp/config/posix/bar.c 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -70,7 +70,7 @@ gomp_barrier_reinit (gomp_barrier_t *bar
+ }
+
+ void
+-gomp_barrier_wait_end (gomp_barrier_t *bar, bool last)
++gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t last)
+ {
+ unsigned int n;
+
+--- libgomp/config/linux/alpha/futex.h.jj 2007-12-07 14:41:00.000000000 +0100
++++ libgomp/config/linux/alpha/futex.h 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -30,8 +30,6 @@
+ #ifndef SYS_futex
+ #define SYS_futex 394
+ #endif
+-#define FUTEX_WAIT 0
+-#define FUTEX_WAKE 1
+
+
+ static inline void
+@@ -45,7 +43,7 @@ futex_wait (int *addr, int val)
+
+ sc_0 = SYS_futex;
+ sc_16 = (long) addr;
+- sc_17 = FUTEX_WAIT;
++ sc_17 = gomp_futex_wait;
+ sc_18 = val;
+ sc_19 = 0;
+ __asm volatile ("callsys"
+@@ -53,6 +51,20 @@ futex_wait (int *addr, int val)
+ : "0"(sc_0), "r" (sc_16), "r"(sc_17), "r"(sc_18), "1"(sc_19)
+ : "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8",
+ "$22", "$23", "$24", "$25", "$27", "$28", "memory");
++ if (__builtin_expect (sc_19, 0) && sc_0 == ENOSYS)
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ sc_0 = SYS_futex;
++ sc_17 &= ~FUTEX_PRIVATE_FLAG;
++ sc_19 = 0;
++ __asm volatile ("callsys"
++ : "=r" (sc_0), "=r"(sc_19)
++ : "0"(sc_0), "r" (sc_16), "r"(sc_17), "r"(sc_18),
++ "1"(sc_19)
++ : "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8",
++ "$22", "$23", "$24", "$25", "$27", "$28", "memory");
++ }
+ }
+
+ static inline void
+@@ -66,11 +78,35 @@ futex_wake (int *addr, int count)
+
+ sc_0 = SYS_futex;
+ sc_16 = (long) addr;
+- sc_17 = FUTEX_WAKE;
++ sc_17 = gomp_futex_wake;
+ sc_18 = count;
+ __asm volatile ("callsys"
+ : "=r" (sc_0), "=r"(sc_19)
+ : "0"(sc_0), "r" (sc_16), "r"(sc_17), "r"(sc_18)
+ : "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8",
+ "$22", "$23", "$24", "$25", "$27", "$28", "memory");
++ if (__builtin_expect (sc_19, 0) && sc_0 == ENOSYS)
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ sc_0 = SYS_futex;
++ sc_17 &= ~FUTEX_PRIVATE_FLAG;
++ __asm volatile ("callsys"
++ : "=r" (sc_0), "=r"(sc_19)
++ : "0"(sc_0), "r" (sc_16), "r"(sc_17), "r"(sc_18)
++ : "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8",
++ "$22", "$23", "$24", "$25", "$27", "$28", "memory");
++ }
++}
++
++static inline void
++cpu_relax (void)
++{
++ __asm volatile ("" : : : "memory");
++}
++
++static inline void
++atomic_write_barrier (void)
++{
++ __asm volatile ("wmb" : : : "memory");
+ }
+--- libgomp/config/linux/affinity.c.jj 2007-12-07 14:41:00.000000000 +0100
++++ libgomp/config/linux/affinity.c 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2006, 2007 Free Software Foundation, Inc.
++/* Copyright (C) 2006, 2007, 2008 Free Software Foundation, Inc.
+ Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -38,9 +38,6 @@
+ #ifdef HAVE_PTHREAD_AFFINITY_NP
+
+ static unsigned int affinity_counter;
+-#ifndef HAVE_SYNC_BUILTINS
+-static gomp_mutex_t affinity_lock;
+-#endif
+
+ void
+ gomp_init_affinity (void)
+@@ -76,9 +73,6 @@ gomp_init_affinity (void)
+ CPU_SET (gomp_cpu_affinity[0], &cpuset);
+ pthread_setaffinity_np (pthread_self (), sizeof (cpuset), &cpuset);
+ affinity_counter = 1;
+-#ifndef HAVE_SYNC_BUILTINS
+- gomp_mutex_init (&affinity_lock);
+-#endif
+ }
+
+ void
+@@ -87,13 +81,7 @@ gomp_init_thread_affinity (pthread_attr_
+ unsigned int cpu;
+ cpu_set_t cpuset;
+
+-#ifdef HAVE_SYNC_BUILTINS
+ cpu = __sync_fetch_and_add (&affinity_counter, 1);
+-#else
+- gomp_mutex_lock (&affinity_lock);
+- cpu = affinity_counter++;
+- gomp_mutex_unlock (&affinity_lock);
+-#endif
+ cpu %= gomp_cpu_affinity_len;
+ CPU_ZERO (&cpuset);
+ CPU_SET (gomp_cpu_affinity[cpu], &cpuset);
+--- libgomp/config/linux/bar.h.jj 2007-12-07 14:41:00.000000000 +0100
++++ libgomp/config/linux/bar.h 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -36,40 +36,49 @@
+
+ typedef struct
+ {
+- gomp_mutex_t mutex;
+- unsigned total;
+- unsigned arrived;
+- int generation;
++ /* Make sure total/generation is in a mostly read cacheline, while
++ awaited in a separate cacheline. */
++ unsigned total __attribute__((aligned (64)));
++ unsigned generation;
++ unsigned awaited __attribute__((aligned (64)));
+ } gomp_barrier_t;
++typedef unsigned int gomp_barrier_state_t;
+
+ static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
+ {
+- gomp_mutex_init (&bar->mutex);
+ bar->total = count;
+- bar->arrived = 0;
++ bar->awaited = count;
+ bar->generation = 0;
+ }
+
+ static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count)
+ {
+- gomp_mutex_lock (&bar->mutex);
++ __sync_fetch_and_add (&bar->awaited, count - bar->total);
+ bar->total = count;
+- gomp_mutex_unlock (&bar->mutex);
+ }
+
+ static inline void gomp_barrier_destroy (gomp_barrier_t *bar)
+ {
+- /* Before destroying, make sure all threads have left the barrier. */
+- gomp_mutex_lock (&bar->mutex);
+ }
+
+ extern void gomp_barrier_wait (gomp_barrier_t *);
+-extern void gomp_barrier_wait_end (gomp_barrier_t *, bool);
++extern void gomp_barrier_wait_last (gomp_barrier_t *);
++extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t);
+
+-static inline bool gomp_barrier_wait_start (gomp_barrier_t *bar)
++static inline gomp_barrier_state_t
++gomp_barrier_wait_start (gomp_barrier_t *bar)
+ {
+- gomp_mutex_lock (&bar->mutex);
+- return ++bar->arrived == bar->total;
++ unsigned int ret = bar->generation;
++ /* Do we need any barrier here or is __sync_add_and_fetch acting
++ as the needed LoadLoad barrier already? */
++ ret += __sync_add_and_fetch (&bar->awaited, -1) == 0;
++ return ret;
++}
++
++static inline bool
++gomp_barrier_last_thread (gomp_barrier_state_t state)
++{
++ return state & 1;
+ }
+
+ #endif /* GOMP_BARRIER_H */
+--- libgomp/config/linux/ptrlock.h.jj 2008-03-26 15:11:32.000000000 +0100
++++ libgomp/config/linux/ptrlock.h 2008-03-26 15:11:32.000000000 +0100
+@@ -0,0 +1,65 @@
++/* Copyright (C) 2008 Free Software Foundation, Inc.
++ Contributed by Jakub Jelinek <jakub@redhat.com>.
++
++ This file is part of the GNU OpenMP Library (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU Lesser General Public License as published by
++ the Free Software Foundation; either version 2.1 of the License, or
++ (at your option) any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
++ more details.
++
++ You should have received a copy of the GNU Lesser General Public License
++ along with libgomp; see the file COPYING.LIB. If not, write to the
++ Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
++ MA 02110-1301, USA. */
++
++/* As a special exception, if you link this library with other files, some
++ of which are compiled with GCC, to produce an executable, this library
++ does not by itself cause the resulting executable to be covered by the
++ GNU General Public License. This exception does not however invalidate
++ any other reasons why the executable file might be covered by the GNU
++ General Public License. */
++
++/* This is a Linux specific implementation of a mutex synchronization
++ mechanism for libgomp. This type is private to the library. This
++ implementation uses atomic instructions and the futex syscall. */
++
++#ifndef GOMP_PTRLOCK_H
++#define GOMP_PTRLOCK_H 1
++
++typedef void *gomp_ptrlock_t;
++
++static inline void gomp_ptrlock_init (gomp_ptrlock_t *ptrlock, void *ptr)
++{
++ *ptrlock = ptr;
++}
++
++extern void *gomp_ptrlock_get_slow (gomp_ptrlock_t *ptrlock);
++static inline void *gomp_ptrlock_get (gomp_ptrlock_t *ptrlock)
++{
++ if ((uintptr_t) *ptrlock > 2)
++ return *ptrlock;
++
++ if (__sync_bool_compare_and_swap (ptrlock, NULL, (uintptr_t) 1))
++ return NULL;
++
++ return gomp_ptrlock_get_slow (ptrlock);
++}
++
++extern void gomp_ptrlock_set_slow (gomp_ptrlock_t *ptrlock, void *ptr);
++static inline void gomp_ptrlock_set (gomp_ptrlock_t *ptrlock, void *ptr)
++{
++ if (!__sync_bool_compare_and_swap (ptrlock, (uintptr_t) 1, ptr))
++ gomp_ptrlock_set_slow (ptrlock, ptr);
++}
++
++static inline void gomp_ptrlock_destroy (gomp_ptrlock_t *ptrlock)
++{
++}
++
++#endif /* GOMP_PTRLOCK_H */
+--- libgomp/config/linux/lock.c.jj 2007-12-07 14:41:00.000000000 +0100
++++ libgomp/config/linux/lock.c 2008-03-26 15:11:32.000000000 +0100
+@@ -29,11 +29,10 @@
+ primitives. This implementation uses atomic instructions and the futex
+ syscall. */
+
+-#include "libgomp.h"
+ #include <string.h>
+ #include <unistd.h>
+ #include <sys/syscall.h>
+-#include "futex.h"
++#include "wait.h"
+
+
+ /* The internal gomp_mutex_t and the external non-recursive omp_lock_t
+@@ -137,7 +136,7 @@ omp_set_nest_lock (omp_nest_lock_t *lock
+ return;
+ }
+
+- futex_wait (&lock->owner, otid);
++ do_wait (&lock->owner, otid);
+ }
+ }
+
+--- libgomp/config/linux/ptrlock.c.jj 2008-03-26 15:11:32.000000000 +0100
++++ libgomp/config/linux/ptrlock.c 2008-03-26 15:11:32.000000000 +0100
+@@ -0,0 +1,70 @@
++/* Copyright (C) 2008 Free Software Foundation, Inc.
++ Contributed by Jakub Jelinek <jakub@redhat.com>.
++
++ This file is part of the GNU OpenMP Library (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU Lesser General Public License as published by
++ the Free Software Foundation; either version 2.1 of the License, or
++ (at your option) any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
++ more details.
++
++ You should have received a copy of the GNU Lesser General Public License
++ along with libgomp; see the file COPYING.LIB. If not, write to the
++ Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
++ MA 02110-1301, USA. */
++
++/* As a special exception, if you link this library with other files, some
++ of which are compiled with GCC, to produce an executable, this library
++ does not by itself cause the resulting executable to be covered by the
++ GNU General Public License. This exception does not however invalidate
++ any other reasons why the executable file might be covered by the GNU
++ General Public License. */
++
++/* This is a Linux specific implementation of a mutex synchronization
++ mechanism for libgomp. This type is private to the library. This
++ implementation uses atomic instructions and the futex syscall. */
++
++#include <endian.h>
++#include <limits.h>
++#include "wait.h"
++
++void *
++gomp_ptrlock_get_slow (gomp_ptrlock_t *ptrlock)
++{
++ int *intptr;
++ __sync_bool_compare_and_swap (ptrlock, 1, 2);
++
++ /* futex works on ints, not pointers.
++ But a valid work share pointer will be at least
++ 8 byte aligned, so it is safe to assume the low
++ 32-bits of the pointer won't contain values 1 or 2. */
++ __asm volatile ("" : "=r" (intptr) : "0" (ptrlock));
++#if __BYTE_ORDER == __BIG_ENDIAN
++ if (sizeof (*ptrlock) > sizeof (int))
++ intptr += (sizeof (*ptrlock) / sizeof (int)) - 1;
++#endif
++ do
++ do_wait (intptr, 2);
++ while (*intptr == 2);
++ __asm volatile ("" : : : "memory");
++ return *ptrlock;
++}
++
++void
++gomp_ptrlock_set_slow (gomp_ptrlock_t *ptrlock, void *ptr)
++{
++ int *intptr;
++
++ *ptrlock = ptr;
++ __asm volatile ("" : "=r" (intptr) : "0" (ptrlock));
++#if __BYTE_ORDER == __BIG_ENDIAN
++ if (sizeof (*ptrlock) > sizeof (int))
++ intptr += (sizeof (*ptrlock) / sizeof (int)) - 1;
++#endif
++ futex_wake (intptr, INT_MAX);
++}
+--- libgomp/config/linux/x86/futex.h.jj 2007-12-07 14:41:00.000000000 +0100
++++ libgomp/config/linux/x86/futex.h 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -27,9 +27,6 @@
+
+ /* Provide target-specific access to the futex system call. */
+
+-#define FUTEX_WAIT 0
+-#define FUTEX_WAKE 1
+-
+ #ifdef __LP64__
+ # ifndef SYS_futex
+ # define SYS_futex 202
+@@ -38,14 +35,26 @@
+ static inline void
+ futex_wait (int *addr, int val)
+ {
+- register long r10 __asm__("%r10") = 0;
++ register long r10 __asm__("%r10");
+ long res;
+
++ r10 = 0;
+ __asm volatile ("syscall"
+ : "=a" (res)
+- : "0"(SYS_futex), "D" (addr), "S"(FUTEX_WAIT),
+- "d"(val), "r"(r10)
++ : "0" (SYS_futex), "D" (addr), "S" (gomp_futex_wait),
++ "d" (val), "r" (r10)
+ : "r11", "rcx", "memory");
++ if (__builtin_expect (res == -ENOSYS, 0))
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ r10 = 0;
++ __asm volatile ("syscall"
++ : "=a" (res)
++ : "0" (SYS_futex), "D" (addr), "S" (gomp_futex_wait),
++ "d" (val), "r" (r10)
++ : "r11", "rcx", "memory");
++ }
+ }
+
+ static inline void
+@@ -55,8 +64,19 @@ futex_wake (int *addr, int count)
+
+ __asm volatile ("syscall"
+ : "=a" (res)
+- : "0"(SYS_futex), "D" (addr), "S"(FUTEX_WAKE), "d"(count)
++ : "0" (SYS_futex), "D" (addr), "S" (gomp_futex_wake),
++ "d" (count)
+ : "r11", "rcx", "memory");
++ if (__builtin_expect (res == -ENOSYS, 0))
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ __asm volatile ("syscall"
++ : "=a" (res)
++ : "0" (SYS_futex), "D" (addr), "S" (gomp_futex_wake),
++ "d" (count)
++ : "r11", "rcx", "memory");
++ }
+ }
+ #else
+ # ifndef SYS_futex
+@@ -65,7 +85,7 @@ futex_wake (int *addr, int count)
+
+ # ifdef __PIC__
+
+-static inline void
++static inline long
+ sys_futex0 (int *addr, int op, int val)
+ {
+ long res;
+@@ -77,11 +97,12 @@ sys_futex0 (int *addr, int op, int val)
+ : "0"(SYS_futex), "r" (addr), "c"(op),
+ "d"(val), "S"(0)
+ : "memory");
++ return res;
+ }
+
+ # else
+
+-static inline void
++static inline long
+ sys_futex0 (int *addr, int op, int val)
+ {
+ long res;
+@@ -91,6 +112,7 @@ sys_futex0 (int *addr, int op, int val)
+ : "0"(SYS_futex), "b" (addr), "c"(op),
+ "d"(val), "S"(0)
+ : "memory");
++ return res;
+ }
+
+ # endif /* __PIC__ */
+@@ -98,13 +120,37 @@ sys_futex0 (int *addr, int op, int val)
+ static inline void
+ futex_wait (int *addr, int val)
+ {
+- sys_futex0 (addr, FUTEX_WAIT, val);
++ long res = sys_futex0 (addr, gomp_futex_wait, val);
++ if (__builtin_expect (res == -ENOSYS, 0))
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ sys_futex0 (addr, gomp_futex_wait, val);
++ }
+ }
+
+ static inline void
+ futex_wake (int *addr, int count)
+ {
+- sys_futex0 (addr, FUTEX_WAKE, count);
++ long res = sys_futex0 (addr, gomp_futex_wake, count);
++ if (__builtin_expect (res == -ENOSYS, 0))
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ sys_futex0 (addr, gomp_futex_wake, count);
++ }
+ }
+
+ #endif /* __LP64__ */
++
++static inline void
++cpu_relax (void)
++{
++ __asm volatile ("rep; nop" : : : "memory");
++}
++
++static inline void
++atomic_write_barrier (void)
++{
++ __sync_synchronize ();
++}
+--- libgomp/config/linux/wait.h.jj 2008-03-26 15:11:32.000000000 +0100
++++ libgomp/config/linux/wait.h 2008-03-26 15:11:32.000000000 +0100
+@@ -0,0 +1,68 @@
++/* Copyright (C) 2008 Free Software Foundation, Inc.
++ Contributed by Jakub Jelinek <jakub@redhat.com>.
++
++ This file is part of the GNU OpenMP Library (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU Lesser General Public License as published by
++ the Free Software Foundation; either version 2.1 of the License, or
++ (at your option) any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
++ more details.
++
++ You should have received a copy of the GNU Lesser General Public License
++ along with libgomp; see the file COPYING.LIB. If not, write to the
++ Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
++ MA 02110-1301, USA. */
++
++/* As a special exception, if you link this library with other files, some
++ of which are compiled with GCC, to produce an executable, this library
++ does not by itself cause the resulting executable to be covered by the
++ GNU General Public License. This exception does not however invalidate
++ any other reasons why the executable file might be covered by the GNU
++ General Public License. */
++
++/* This is a Linux specific implementation of a mutex synchronization
++ mechanism for libgomp. This type is private to the library. This
++ implementation uses atomic instructions and the futex syscall. */
++
++#ifndef GOMP_WAIT_H
++#define GOMP_WAIT_H 1
++
++#include "libgomp.h"
++#include <errno.h>
++
++#define FUTEX_WAIT 0
++#define FUTEX_WAKE 1
++#define FUTEX_PRIVATE_FLAG 128L
++
++#ifdef HAVE_ATTRIBUTE_VISIBILITY
++# pragma GCC visibility push(hidden)
++#endif
++
++extern long int gomp_futex_wait, gomp_futex_wake;
++
++#include "futex.h"
++
++static inline void do_wait (int *addr, int val)
++{
++ unsigned long long i, count = gomp_spin_count_var;
++
++ if (__builtin_expect (gomp_managed_threads > gomp_available_cpus, 0))
++ count = gomp_throttled_spin_count_var;
++ for (i = 0; i < count; i++)
++ if (__builtin_expect (*addr != val, 0))
++ return;
++ else
++ cpu_relax ();
++ futex_wait (addr, val);
++}
++
++#ifdef HAVE_ATTRIBUTE_VISIBILITY
++# pragma GCC visibility pop
++#endif
++
++#endif /* GOMP_WAIT_H */
+--- libgomp/config/linux/sparc/futex.h.jj 2007-12-07 14:41:00.000000000 +0100
++++ libgomp/config/linux/sparc/futex.h 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -28,10 +28,8 @@
+ /* Provide target-specific access to the futex system call. */
+
+ #include <sys/syscall.h>
+-#define FUTEX_WAIT 0
+-#define FUTEX_WAKE 1
+
+-static inline void
++static inline long
+ sys_futex0 (int *addr, int op, int val)
+ {
+ register long int g1 __asm__ ("g1");
+@@ -47,9 +45,9 @@ sys_futex0 (int *addr, int op, int val)
+ o3 = 0;
+
+ #ifdef __arch64__
+-# define SYSCALL_STRING "ta\t0x6d"
++# define SYSCALL_STRING "ta\t0x6d; bcs,a,pt %%xcc, 1f; sub %%g0, %%o0, %%o0; 1:"
+ #else
+-# define SYSCALL_STRING "ta\t0x10"
++# define SYSCALL_STRING "ta\t0x10; bcs,a 1f; sub %%g0, %%o0, %%o0; 1:"
+ #endif
+
+ __asm volatile (SYSCALL_STRING
+@@ -65,16 +63,49 @@ sys_futex0 (int *addr, int op, int val)
+ "f48", "f50", "f52", "f54", "f56", "f58", "f60", "f62",
+ #endif
+ "cc", "memory");
++ return o0;
+ }
+
+ static inline void
+ futex_wait (int *addr, int val)
+ {
+- sys_futex0 (addr, FUTEX_WAIT, val);
++ long err = sys_futex0 (addr, gomp_futex_wait, val);
++ if (__builtin_expect (err == ENOSYS, 0))
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ sys_futex0 (addr, gomp_futex_wait, val);
++ }
+ }
+
+ static inline void
+ futex_wake (int *addr, int count)
+ {
+- sys_futex0 (addr, FUTEX_WAKE, count);
++ long err = sys_futex0 (addr, gomp_futex_wake, count);
++ if (__builtin_expect (err == ENOSYS, 0))
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ sys_futex0 (addr, gomp_futex_wake, count);
++ }
++}
++
++static inline void
++cpu_relax (void)
++{
++#if defined __arch64__ || defined __sparc_v9__
++ __asm volatile ("membar #LoadLoad" : : : "memory");
++#else
++ __asm volatile ("" : : : "memory");
++#endif
++}
++
++static inline void
++atomic_write_barrier (void)
++{
++#if defined __arch64__ || defined __sparc_v9__
++ __asm volatile ("membar #StoreStore" : : : "memory");
++#else
++ __sync_synchronize ();
++#endif
+ }
+--- libgomp/config/linux/ia64/futex.h.jj 2007-12-07 14:41:00.000000000 +0100
++++ libgomp/config/linux/ia64/futex.h 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -29,23 +29,24 @@
+
+ #include <sys/syscall.h>
+
+-#define FUTEX_WAIT 0
+-#define FUTEX_WAKE 1
+
+
+-static inline void
+-sys_futex0(int *addr, int op, int val)
++static inline long
++sys_futex0(int *addr, long op, int val)
+ {
+ register long out0 asm ("out0") = (long) addr;
+ register long out1 asm ("out1") = op;
+ register long out2 asm ("out2") = val;
+ register long out3 asm ("out3") = 0;
++ register long r8 asm ("r8");
++ register long r10 asm ("r10");
+ register long r15 asm ("r15") = SYS_futex;
+
+ __asm __volatile ("break 0x100000"
+- : "=r"(r15), "=r"(out0), "=r"(out1), "=r"(out2), "=r"(out3)
++ : "=r"(r15), "=r"(out0), "=r"(out1), "=r"(out2), "=r"(out3),
++ "=r"(r8), "=r"(r10)
+ : "r"(r15), "r"(out0), "r"(out1), "r"(out2), "r"(out3)
+- : "memory", "r8", "r10", "out4", "out5", "out6", "out7",
++ : "memory", "out4", "out5", "out6", "out7",
+ /* Non-stacked integer registers, minus r8, r10, r15. */
+ "r2", "r3", "r9", "r11", "r12", "r13", "r14", "r16", "r17", "r18",
+ "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27",
+@@ -56,16 +57,41 @@ sys_futex0(int *addr, int op, int val)
+ "f6", "f7", "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15",
+ /* Branch registers. */
+ "b6");
++ return r8 & r10;
+ }
+
+ static inline void
+ futex_wait (int *addr, int val)
+ {
+- sys_futex0 (addr, FUTEX_WAIT, val);
++ long err = sys_futex0 (addr, gomp_futex_wait, val);
++ if (__builtin_expect (err == ENOSYS, 0))
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ sys_futex0 (addr, gomp_futex_wait, val);
++ }
+ }
+
+ static inline void
+ futex_wake (int *addr, int count)
+ {
+- sys_futex0 (addr, FUTEX_WAKE, count);
++ long err = sys_futex0 (addr, gomp_futex_wake, count);
++ if (__builtin_expect (err == ENOSYS, 0))
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ sys_futex0 (addr, gomp_futex_wake, count);
++ }
++}
++
++static inline void
++cpu_relax (void)
++{
++ __asm volatile ("hint @pause" : : : "memory");
++}
++
++static inline void
++atomic_write_barrier (void)
++{
++ __sync_synchronize ();
+ }
+--- libgomp/config/linux/s390/futex.h.jj 2007-12-07 14:41:00.000000000 +0100
++++ libgomp/config/linux/s390/futex.h 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -28,10 +28,8 @@
+ /* Provide target-specific access to the futex system call. */
+
+ #include <sys/syscall.h>
+-#define FUTEX_WAIT 0
+-#define FUTEX_WAKE 1
+
+-static inline void
++static inline long
+ sys_futex0 (int *addr, int op, int val)
+ {
+ register long int gpr2 __asm__ ("2");
+@@ -49,16 +47,41 @@ sys_futex0 (int *addr, int op, int val)
+ : "i" (SYS_futex),
+ "0" (gpr2), "d" (gpr3), "d" (gpr4), "d" (gpr5)
+ : "memory");
++ return gpr2;
+ }
+
+ static inline void
+ futex_wait (int *addr, int val)
+ {
+- sys_futex0 (addr, FUTEX_WAIT, val);
++ long err = sys_futex0 (addr, gomp_futex_wait, val);
++ if (__builtin_expect (err == -ENOSYS, 0))
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ sys_futex0 (addr, gomp_futex_wait, val);
++ }
+ }
+
+ static inline void
+ futex_wake (int *addr, int count)
+ {
+- sys_futex0 (addr, FUTEX_WAKE, count);
++ long err = sys_futex0 (addr, gomp_futex_wake, count);
++ if (__builtin_expect (err == -ENOSYS, 0))
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ sys_futex0 (addr, gomp_futex_wake, count);
++ }
++}
++
++static inline void
++cpu_relax (void)
++{
++ __asm volatile ("" : : : "memory");
++}
++
++static inline void
++atomic_write_barrier (void)
++{
++ __sync_synchronize ();
+ }
+--- libgomp/config/linux/mutex.c.jj 2007-12-07 14:41:00.000000000 +0100
++++ libgomp/config/linux/mutex.c 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -29,9 +29,10 @@
+ mechanism for libgomp. This type is private to the library. This
+ implementation uses atomic instructions and the futex syscall. */
+
+-#include "libgomp.h"
+-#include "futex.h"
++#include "wait.h"
+
++long int gomp_futex_wake = FUTEX_WAKE | FUTEX_PRIVATE_FLAG;
++long int gomp_futex_wait = FUTEX_WAIT | FUTEX_PRIVATE_FLAG;
+
+ void
+ gomp_mutex_lock_slow (gomp_mutex_t *mutex)
+@@ -40,7 +41,7 @@ gomp_mutex_lock_slow (gomp_mutex_t *mute
+ {
+ int oldval = __sync_val_compare_and_swap (mutex, 1, 2);
+ if (oldval != 0)
+- futex_wait (mutex, 2);
++ do_wait (mutex, 2);
+ }
+ while (!__sync_bool_compare_and_swap (mutex, 0, 2));
+ }
+--- libgomp/config/linux/sem.c.jj 2007-12-07 14:41:00.000000000 +0100
++++ libgomp/config/linux/sem.c 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -29,8 +29,7 @@
+ mechanism for libgomp. This type is private to the library. This
+ implementation uses atomic instructions and the futex syscall. */
+
+-#include "libgomp.h"
+-#include "futex.h"
++#include "wait.h"
+
+
+ void
+@@ -44,7 +43,7 @@ gomp_sem_wait_slow (gomp_sem_t *sem)
+ if (__sync_bool_compare_and_swap (sem, val, val - 1))
+ return;
+ }
+- futex_wait (sem, -1);
++ do_wait (sem, -1);
+ }
+ }
+
+--- libgomp/config/linux/powerpc/futex.h.jj 2007-12-07 14:41:00.000000000 +0100
++++ libgomp/config/linux/powerpc/futex.h 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -28,10 +28,8 @@
+ /* Provide target-specific access to the futex system call. */
+
+ #include <sys/syscall.h>
+-#define FUTEX_WAIT 0
+-#define FUTEX_WAKE 1
+
+-static inline void
++static inline long
+ sys_futex0 (int *addr, int op, int val)
+ {
+ register long int r0 __asm__ ("r0");
+@@ -50,21 +48,48 @@ sys_futex0 (int *addr, int op, int val)
+ doesn't. It doesn't much matter for us. In the interest of unity,
+ go ahead and clobber it always. */
+
+- __asm volatile ("sc"
++ __asm volatile ("sc; mfcr %0"
+ : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6)
+ : "r"(r0), "r"(r3), "r"(r4), "r"(r5), "r"(r6)
+ : "r7", "r8", "r9", "r10", "r11", "r12",
+ "cr0", "ctr", "memory");
++ if (__builtin_expect (r0 & (1 << 28), 0))
++ return r3;
++ return 0;
+ }
+
+ static inline void
+ futex_wait (int *addr, int val)
+ {
+- sys_futex0 (addr, FUTEX_WAIT, val);
++ long err = sys_futex0 (addr, gomp_futex_wait, val);
++ if (__builtin_expect (err == ENOSYS, 0))
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ sys_futex0 (addr, gomp_futex_wait, val);
++ }
+ }
+
+ static inline void
+ futex_wake (int *addr, int count)
+ {
+- sys_futex0 (addr, FUTEX_WAKE, count);
++ long err = sys_futex0 (addr, gomp_futex_wake, count);
++ if (__builtin_expect (err == ENOSYS, 0))
++ {
++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG;
++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG;
++ sys_futex0 (addr, gomp_futex_wake, count);
++ }
++}
++
++static inline void
++cpu_relax (void)
++{
++ __asm volatile ("" : : : "memory");
++}
++
++static inline void
++atomic_write_barrier (void)
++{
++ __asm volatile ("eieio" : : : "memory");
+ }
+--- libgomp/config/linux/bar.c.jj 2007-12-07 14:41:00.000000000 +0100
++++ libgomp/config/linux/bar.c 2008-03-26 15:11:32.000000000 +0100
+@@ -1,4 +1,4 @@
+-/* Copyright (C) 2005 Free Software Foundation, Inc.
++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc.
+ Contributed by Richard Henderson <rth@redhat.com>.
+
+ This file is part of the GNU OpenMP Library (libgomp).
+@@ -29,32 +29,29 @@
+ mechanism for libgomp. This type is private to the library. This
+ implementation uses atomic instructions and the futex syscall. */
+
+-#include "libgomp.h"
+-#include "futex.h"
+ #include <limits.h>
++#include "wait.h"
+
+
+ void
+-gomp_barrier_wait_end (gomp_barrier_t *bar, bool last)
++gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+ {
+- if (last)
++ if (__builtin_expect ((state & 1) != 0, 0))
+ {
+- bar->generation++;
+- futex_wake (&bar->generation, INT_MAX);
++ /* Next time we'll be awaiting TOTAL threads again. */
++ bar->awaited = bar->total;
++ atomic_write_barrier ();
++ bar->generation += 2;
++ futex_wake ((int *) &bar->generation, INT_MAX);
+ }
+ else
+ {
+- unsigned int generation = bar->generation;
+-
+- gomp_mutex_unlock (&bar->mutex);
++ unsigned int generation = state;
+
+ do
+- futex_wait (&bar->generation, generation);
++ do_wait ((int *) &bar->generation, generation);
+ while (bar->generation == generation);
+ }
+-
+- if (__sync_add_and_fetch (&bar->arrived, -1) == 0)
+- gomp_mutex_unlock (&bar->mutex);
+ }
+
+ void
+@@ -62,3 +59,18 @@ gomp_barrier_wait (gomp_barrier_t *barri
+ {
+ gomp_barrier_wait_end (barrier, gomp_barrier_wait_start (barrier));
+ }
++
++/* Like gomp_barrier_wait, except that if the encountering thread
++ is not the last one to hit the barrier, it returns immediately.
++ The intended usage is that a thread which intends to gomp_barrier_destroy
++ this barrier calls gomp_barrier_wait, while all other threads
++ call gomp_barrier_wait_last. When gomp_barrier_wait returns,
++ the barrier can be safely destroyed. */
++
++void
++gomp_barrier_wait_last (gomp_barrier_t *barrier)
++{
++ gomp_barrier_state_t state = gomp_barrier_wait_start (barrier);
++ if (state & 1)
++ gomp_barrier_wait_end (barrier, state);
++}