=== modified file 'configure.ac'
--- configure.ac	2012-05-29 12:20:02 +0000
+++ configure.ac	2012-06-09 00:28:37 +0000
@@ -378,7 +378,8 @@
 dnl
 dnl Check for atomic operations support in the compiler
 dnl
-AC_MSG_CHECKING([for atomic operations support])
+AC_CHECK_HEADERS(atomic)
+AC_MSG_CHECKING([for GNU atomic operations support])
 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
     int n = 0;
 ]],[[

=== modified file 'lib/profiler/Profiler.cc'
--- lib/profiler/Profiler.cc	2012-01-20 18:55:04 +0000
+++ lib/profiler/Profiler.cc	2012-06-07 00:52:43 +0000
@@ -118,10 +118,6 @@
 
 #if USE_XPROF_STATS
 
-
-#if HAVE_ASSERT_H
-#include <assert.h>
-#endif
 #if HAVE_GNUMALLLOC_H
 #include <gnumalloc.h>
 #elif HAVE_MALLOC_H
@@ -131,39 +127,10 @@
 #include <unistd.h>
 #endif
 
-/* Exported Data */
 TimersArray *xprof_Timers = NULL;
-
-/* Private stuff */
-
-
-/* new stuff */
-#define	MAXSTACKDEPTH	512
-
-struct _callstack_entry {
-    int timer;		/* index into timers array */
-    const char *name;
-    hrtime_t start, stop, accum;
-};
-
-struct _callstack_entry cstack[MAXSTACKDEPTH];
-int cstack_head = 0;
-
-#if defined(_MSC_VER) /* Microsoft C Compiler ONLY */
-static __inline void
-#else
-static inline void
-#endif
-xprof_update(xprof_stats_data * head)
-{
-    if (head->delta < head->best)
-        head->best = head->delta;
-    if (head->worst < head->delta)
-        head->worst = head->delta;
-    head->summ += head->delta;
-    head->count++;
-}
-
+uint64_t *xprof_ActiveTimers = NULL;
+uint64_t xprof_ActiveTimerCount = 0;
+std::atomic_flag xprof_GlobalLock = ATOMIC_FLAG_INIT;
 static xprof_stats_data *xp_UNACCOUNTED;
 static int xprof_inited = 0;
 
@@ -173,71 +140,72 @@
     if (xprof_inited)
         return;
 
-    xprof_Timers = static_cast<TimersArray *>(calloc(XPROF_LAST + 2, sizeof(xprof_stats_node)));
-
-    xprof_Timers[XPROF_PROF_UNACCOUNTED]->name = "PROF_UNACCOUNTED";
-    xprof_Timers[XPROF_PROF_UNACCOUNTED]->accu.start = get_tick();
-    xp_UNACCOUNTED = &xprof_Timers[XPROF_PROF_UNACCOUNTED]->accu;
-    cstack_head = 0;
+    while(std::atomic_flag_test_and_set_explicit(&xprof_globalLock, std::memory_order_acquire))
+         ; // spin until the global lock is acquired
+    xprof_TimerStats = static_cast<TimersArray *>(calloc(XPROF_LAST + 2, sizeof(xprof_stats_node)));
+    xprof_ActiveTimers = static_cast<uint64_t *>(calloc(XPROF_LAST + 2, sizeof(uint64_t)));
+    xprof_ActiveTimerCount = static_cast<uint64_t *>(calloc(XPROF_LAST + 2, sizeof(uint64_t)));
     xprof_inited = 1;
-}
-
-void
-xprof_start(xprof_type type, const char *timer)
-{
-    hrtime_t tt = get_tick();
-    if (!xprof_inited)
-        xprof_InitLib();
-
-    /* If nested, stop current stack frame */
-    if (cstack_head > 0) {
-        cstack[cstack_head - 1].accum += get_tick() - cstack[cstack_head - 1].start;
-        cstack[cstack_head - 1].start = -1;
-    }
-
-    /* Are we at the first level? If so; stop the unaccounted timer */
-    if (cstack_head == 0) {
-        assert(xp_UNACCOUNTED->start != -1);
-        xp_UNACCOUNTED->delta = tt - xp_UNACCOUNTED->start;
-        xp_UNACCOUNTED->start = -1;
-        xprof_update(xp_UNACCOUNTED);
-    }
-
-    /* Allocate new stack frame */
-    cstack[cstack_head].start = tt;
-    cstack[cstack_head].stop = -1;
-    cstack[cstack_head].accum = 0;
-    cstack[cstack_head].timer = type;
-    cstack[cstack_head].name = timer;
-    cstack_head++;
-    assert(cstack_head < MAXSTACKDEPTH);
-
-}
-
-void
-xprof_stop(xprof_type type, const char *timer)
-{
-    hrtime_t tt = get_tick();
-    assert(cstack_head > 0);
-    cstack_head--;
-    assert(cstack[cstack_head].timer == type);
-
-    /* Record timer details */
-    cstack[cstack_head].accum += tt - cstack[cstack_head].start;
-    xprof_Timers[type]->accu.delta = cstack[cstack_head].accum;
-    xprof_Timers[type]->name = timer;
-
-    /* Update */
-    xprof_update(&xprof_Timers[type]->accu);
-
-    /* Restart previous timer if we're not at the top level */
-    if (cstack_head > 0) {
-        cstack[cstack_head - 1].start = tt;
-        cstack[cstack_head - 1].stop = 0;
-        return;
-    }
-    /* Get here? We're at the top level; unaccounted */
-    xp_UNACCOUNTED->start = tt;
+    std::atomic_flag_clear_explicit(&xprof_globalLock, std::memory_order_release);
+}
+
+void
+XProfilerNode::start()
+{
+    if (started_ > 0)
+        return;
+
+    started_ = get_tick();
+
+    // kick initialization of timers statistics (just in case)
+    xprof_InitLib();
+
+    while(std::atomic_flag_test_and_set_explicit(&xprof_globalLock, std::memory_order_acquire))
+         ; // spin until the global lock is acquired
+    xprof_ActiveTimers[type] += started_;
+    xprof_ActiveTimerCount++;
+    std::atomic_flag_clear_explicit(&xprof_globalLock, std::memory_order_release);
+
+    xprof_stats_data * head = &(xprof_Timers[type_]->accu);
+    while(std::atomic_flag_test_and_set_explicit(&head->lock, std::memory_order_acquire))
+         ; // spin until the list head lock is acquired
+    head->overheads += get_tick() - started_;
+    std::atomic_flag_clear_explicit(&head->lock, std::memory_order_release);
+}
+
+void
+XProfilerNode::stop()
+{
+    if (started_ < 0)
+        return;
+
+    hrtime_t stopped_ = get_tick();
+
+    // calculate how long this timer was running.
+    hrtime_t delta = stopped_ - started_;
+
+    // kick initialization of timers statistics (just in case)
+    xprof_InitLib();
+
+    // Record timer details into global counters
+
+    while(std::atomic_flag_test_and_set_explicit(&xprof_globalLock, std::memory_order_acquire))
+         ; // spin until the global lock is acquired
+    xprof_ActiveTimers[type_] -= started_;
+    xprof_ActiveTimerCount[type_]--;
+    std::atomic_flag_clear_explicit(&xprof_globalLock, std::memory_order_release);
+
+    xprof_stats_data * head = &(xprof_Timers[type_]->accu);
+    while(std::atomic_flag_test_and_set_explicit(&head->lock, std::memory_order_acquire))
+         ; // spin until the list head lock is acquired
+    if (delta < head->best)
+        head->best = delta;
+    if (delta > head->worst)
+        head->worst = delta;
+    head->summ += delta;
+    head->count++;
+    head->overheads += get_tick() - stopped_;
+    std::atomic_flag_clear_explicit(&head->lock, std::memory_order_release);
 }
 
 #endif /* USE_XPROF_STATS */

=== modified file 'lib/profiler/Profiler.h'
--- lib/profiler/Profiler.h	2011-01-11 07:31:04 +0000
+++ lib/profiler/Profiler.h	2012-06-07 00:33:42 +0000
@@ -18,6 +18,10 @@
 
 #else /* USE_XPROF_STATS */
 
+#if HAVE_ATOMIC
+#include <atomic>
+#endif
+
 #define XP_NOBEST (hrtime_t)-1
 
     typedef struct _xprof_stats_node xprof_stats_node;
@@ -25,13 +29,15 @@
     typedef struct _xprof_stats_data xprof_stats_data;
 
     struct _xprof_stats_data {
+        std::atomic_flag lock;
         hrtime_t start;
         hrtime_t stop;
-        hrtime_t delta;
+        hrtime_t delta; // unused?
         hrtime_t best;
         hrtime_t worst;
         hrtime_t count;
         hrtime_t accum;
+        int64_t overheads;
         int64_t summ;
     };
 
@@ -44,15 +50,42 @@
     typedef xprof_stats_node TimersArray[1];
 
     /* public Data */
+    /// timer statistics histograms
     extern TimersArray *xprof_Timers;
 
+    /// count of active timers
+    extern uint64_t *xprof_ActiveTimerCount;
+    /// currently running timer accumulations
+    extern uint64_t *xprof_ActiveTimers;
+
     /* Exported functions */
-    extern void xprof_start(xprof_type type, const char *timer);
-    extern void xprof_stop(xprof_type type, const char *timer);
     extern void xprof_event(void *data);
 
-#define PROF_start(probename) xprof_start(XPROF_##probename, #probename)
-#define PROF_stop(probename) xprof_stop(XPROF_##probename, #probename)
+/*
+/// This class provides a thread-safe timer for use in code profiling
+/// Timer kicks off when the class is creaated and terminates on destruct
+/// or if stop() method is called explicitly.
+///
+/// Use PROF_start(name) and PROF_stop(name) macros
+*/
+class XProfilerNode
+{
+public:
+    XProfilerNode(xprof_type aType, const char *aName) : type_(aType), name_(aName) { start(); }
+    ~XProfilerNode() { stop(); }
+
+    void start();
+    void stop();
+
+private:
+    xprof_type type_;     ///< which profile type to update when stopped
+    const char *name_;    ///< printable version of the timers type name
+    hrtime_t started_;    ///< when this timer was started
+};
+
+#define PROF_start(probename) XProfilerNode XProfTimer_##probename(XPROF_##probename, #probename)
+#define PROF_restart(probename) XProfTimer_##probename.start()
+#define PROF_stop(probename) XProfTimer_##probename.stop()
 
 #endif /* USE_XPROF_STATS */
 

=== modified file 'src/ProfStats.cc'
--- src/ProfStats.cc	2012-01-20 18:55:04 +0000
+++ src/ProfStats.cc	2012-06-07 01:24:08 +0000
@@ -46,12 +46,12 @@
 
 #define MAX_SORTLIST 200
 
+static hrtime_t now;
 static hrtime_t xprof_delta = 0;
 static hrtime_t xprof_start_t = 0;
 static hrtime_t xprof_verystart = 0;
 static hrtime_t xprof_average_delta = 0;
 static int xprof_events = 0;
-static int xprof_inited = 0;
 static xprof_stats_data Totals;
 
 static TimersArray *xprof_stats_avg1sec = NULL;
@@ -113,13 +113,14 @@
 xprof_show_item(StoreEntry * sentry, const char *name, xprof_stats_data * hist)
 {
     storeAppendPrintf(sentry,
-                      "%s\t %" PRIu64 "\t %" PRIu64 "\t %" PRIu64 "\t %" PRIu64 "\t %" PRIu64 "\t %.2f\t %6.3f\t\n",
+                      "%s\t %" PRIu64 "\t %" PRIu64 "\t %" PRIu64 "\t %" PRIu64 "\t %" PRIu64 "\t" PRIu64 "\t %.2f\t %6.3f\t\n",
                       name,
                       hist->count,
                       hist->summ,
                       (hist->best != XP_NOBEST ? hist->best : 0),
                       hist->count ? hist->summ / hist->count : 0,
                       hist->worst,
+                      hist->overheads,
                       hist->count / time_frame,
                       Math::doublePercent((double) hist->summ, (double) hist->delta));
 }
@@ -149,7 +150,7 @@
                      );
 
     storeAppendPrintf(sentry,
-                      "Probe Name\t  Events\t cumulated time \t best case \t average \t worst case\t Rate / sec \t %% in int\n");
+                      "Probe Name\t  Events\t cumulated time \t best case \t average \t worst case\toverheads\t Rate / sec \t %% in int\n");
 
     for (i = 0; i < XPROF_LAST; i++) {
         if (!hist[i]->name)
@@ -166,6 +167,8 @@
 
         Totals.summ += show->summ;
 
+        Totals.overheads += show->overheads;
+
         Totals.best += (show->best != XP_NOBEST ? show->best : 0);
 
         Totals.worst += show->worst;
@@ -194,7 +197,11 @@
     now = get_tick();
 
     for (i = 0; i < XPROF_LAST; i++) {
+        while(std::atomic_flag_test_and_set_explicit(&(head[i]->lock), std::memory_order_acquire))
+             ; // spin until the list record lock is acquired
+
         hist[i]->name = head[i]->name;
+        hist[i]->accu.overheads += head[i]->accu.overheads;
         hist[i]->accu.summ += head[i]->accu.summ;
         hist[i]->accu.count += head[i]->accu.count;	/* accumulate multisec */
 
@@ -225,6 +232,8 @@
             hist[i]->accu.delta = 0;
             head[i]->accu.start = keep;
         }
+
+        std::atomic_flag_clear_explicit(&(head[i]->lock), std::memory_order_release);
     }
 }
 
@@ -259,41 +268,37 @@
 static inline void
 xprof_chk_overhead(int samples)
 {
+    // NOTE: these profiler checks require both global and list xprof_* locks
+    PROF_start(PROF_OVERHEAD);
     while (samples--) {
-        PROF_start(PROF_OVERHEAD);
+        PROF_restart(PROF_OVERHEAD);
         PROF_stop(PROF_OVERHEAD);
     }
 }
 
 static void
-xprofRegisterWithCacheManager(void)
-{
-    Mgr::RegisterAction("cpu_profile", "CPU Profiling Stats", xprof_summary, 0, 1);
-}
-
-// FIXME:
-// this gets colled once per event. This doesn't seem to make much sense,
-// does it?
-static hrtime_t now;
-static void
 xprof_Init(void)
 {
+    static int xprof_inited = 0;
     if (xprof_inited)
         return;
 
-    xprof_delta = xprof_verystart = xprof_start_t = now;
-
+    xprof_delta = xprof_verystart = xprof_start_t = get_tick();
     xprof_inited = 1;
-
-    xprofRegisterWithCacheManager(); //moved here so it's not double-init'ed
+    Mgr::RegisterAction("cpu_profile", "CPU Profiling Stats", xprof_summary, 0, 1);
 }
 
 void
 xprof_event(void *data)
 {
-    now = get_tick();
+    hrtime_t now = get_tick();
     xprof_Init();
     xprof_delta = now - xprof_start_t;
+    xprof_chk_overhead(2);
+
+    while(std::atomic_flag_test_and_set_explicit(&xprof_globalLock, std::memory_order_acquire))
+         ; // spin until the global lock is acquired
+
     xprof_start_t = now;
     xprof_events++;
 
@@ -303,8 +308,6 @@
     if (xprof_average_delta > (xprof_delta >> 1))
         xprof_average_delta = xprof_average_delta - (xprof_average_delta >> 8) + (xprof_delta >> 8);
 
-    xprof_chk_overhead(2);
-
     xprof_average(&xprof_stats_avg24hour, 24 * 3600);
 
     xprof_average(&xprof_stats_avg5hour, 5 * 3600);
@@ -323,6 +326,8 @@
 
     xprof_average(&xprof_stats_avg1sec, 1);
 
+    std::atomic_flag_clear_explicit(&xprof_globalLock, std::memory_order_release);
+
     xprof_chk_overhead(30);
 
     eventAdd("cpuProfiling", xprof_event, NULL, 1.0, 1);

