6138 don't abuse atomic_cas_*


  47 #include <sys/x86_archext.h>
  48 
  49 /*
  50  * Implementation for cross-processor calls via interprocessor interrupts
  51  *
  52  * This implementation uses a message passing architecture to allow multiple
  53  * concurrent cross calls to be in flight at any given time. We use the cmpxchg
  54  * instruction, aka atomic_cas_ptr(), to implement simple, efficient work
  55  * queues for message passing between CPUs with almost no need for regular
  56  * locking.  See xc_extract() and xc_insert() below.
  57  *
  58  * The general idea is that initiating a cross call means putting a message
  59  * on the work queue of each target CPU. Any synchronization is handled by
  60  * passing the message back and forth between the initiator and the targets.
  61  *
  62  * Every CPU has xc_work_cnt, which indicates it has messages to process.
  63  * This value is incremented as message traffic is initiated and decremented
  64  * with every message that finishes all processing.
  65  *
  66  * The code needs no mfence or other membar_*() calls. The uses of
  67  * atomic_cas_ptr(), atomic_cas_32() and atomic_dec_32() for the message
  68  * passing are implemented with LOCK-prefixed instructions, which are
  69  * equivalent to mfence.
  70  *
  71  * One interesting aspect of this implementation is that it allows 2 or more
  72  * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
  73  * The cross call processing by the CPUs will happen in any order with only
  74  * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
  75  * from cross calls before all slaves have invoked the function.
  76  *
  77  * The reason for this asynchronous approach is to allow for fast global
  78  * TLB shootdowns. If all CPUs, say N of them, tried to do a global TLB
  79  * invalidation on different virtual addresses at the same time, the old
  80  * code required N squared IPIs. With this method, depending on timing,
  81  * it could happen with just N IPIs.
  82  */
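
To make the queue handling described above concrete, here is a minimal sketch of
the kind of CAS-based push xc_insert() performs. The sketch_msg_t type and its
next field are illustrative assumptions for this sketch, not the real xc_msg_t
layout:

#include <sys/atomic.h>

typedef struct sketch_msg {
        struct sketch_msg *next;        /* singly linked list link */
} sketch_msg_t;

/*
 * Push msg onto the front of the list headed by *queue. If another CPU
 * changes the head between the read and the cmpxchg, atomic_cas_ptr()
 * returns something other than old_head and the loop retries.
 */
static void
sketch_insert(sketch_msg_t **queue, sketch_msg_t *msg)
{
        sketch_msg_t *old_head;

        do {
                old_head = *queue;
                msg->next = old_head;
        } while (atomic_cas_ptr(queue, old_head, msg) != old_head);
}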
  83 
  84 /*
  85  * The default is to not enable collection of IPI counts, since
  86  * updating the shared cachelines could cause excess bus traffic.
  87  */


 125  * operations don't accept volatile bit vectors - which is a bit silly.
 126  */
 127 #define XC_BT_SET(vector, b)    BT_ATOMIC_SET((ulong_t *)(vector), (b))
 128 #define XC_BT_CLEAR(vector, b)  BT_ATOMIC_CLEAR((ulong_t *)(vector), (b))
 129 
 130 /*
 131  * Decrement a CPU's work count
 132  */
 133 static void
 134 xc_decrement(struct machcpu *mcpu)
 135 {
 136         atomic_dec_32(&mcpu->xc_work_cnt);
 137 }
 138 
 139 /*
 140  * Increment a CPU's work count and return the old value
 141  */
 142 static int
 143 xc_increment(struct machcpu *mcpu)
 144 {
 145         int old;
 146         do {
 147                 old = mcpu->xc_work_cnt;
 148         } while (atomic_cas_32(&mcpu->xc_work_cnt, old, old + 1) != old);
 149         return (old);
 150 }
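
The open-coded compare-and-swap loop above is the sort of atomic_cas_* abuse the
synopsis refers to: it is just a fetch-and-increment. A small sketch of the
equivalence (the helper names are illustrative); atomic_inc_32_nv() returns the
new value, so subtracting one yields the old value, which is the form the new
version below adopts:

#include <sys/atomic.h>

/* Fetch-and-increment written as a compare-and-swap retry loop. */
static uint32_t
fetch_inc_with_cas(volatile uint32_t *p)
{
        uint32_t old;

        do {
                old = *p;
        } while (atomic_cas_32(p, old, old + 1) != old);
        return (old);
}

/* The same operation as a single LOCK-ed increment; no retry loop needed. */
static uint32_t
fetch_inc_with_inc_nv(volatile uint32_t *p)
{
        return (atomic_inc_32_nv(p) - 1);
}

Besides being shorter, the single atomic increment never has to retry, while
the CAS loop can spin when several CPUs hit the counter at once.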
 151 
 152 /*
 153  * Put a message into a queue. The insertion is atomic no matter
 154  * how many different inserts/extracts to the same queue happen.
 155  */
 156 static void
 157 xc_insert(void *queue, xc_msg_t *msg)
 158 {
 159         xc_msg_t *old_head;
 160 
 161         /*
 162          * FREE messages should only ever be inserted into
 163          * the xc_master CPU's xc_free queue.
 164          */
 165         ASSERT(msg->xc_command != XC_MSG_FREE ||
 166             cpu[msg->xc_master] == NULL || /* possible only during init */
 167             queue == &cpu[msg->xc_master]->cpu_m.xc_free);
 168 
 169         do {




  47 #include <sys/x86_archext.h>
  48 
  49 /*
  50  * Implementation for cross-processor calls via interprocessor interrupts
  51  *
  52  * This implementation uses a message passing architecture to allow multiple
  53  * concurrent cross calls to be in flight at any given time. We use the cmpxchg
  54  * instruction, aka atomic_cas_ptr(), to implement simple, efficient work
  55  * queues for message passing between CPUs with almost no need for regular
  56  * locking.  See xc_extract() and xc_insert() below.
  57  *
  58  * The general idea is that initiating a cross call means putting a message
  59  * on the work queue of each target CPU. Any synchronization is handled by
  60  * passing the message back and forth between the initiator and the targets.
  61  *
  62  * Every CPU has xc_work_cnt, which indicates it has messages to process.
  63  * This value is incremented as message traffic is initiated and decremented
  64  * with every message that finishes all processing.
  65  *
  66  * The code needs no mfence or other membar_*() calls. The uses of
  67  * atomic_cas_ptr(), atomic_inc_32_nv() and atomic_dec_32() for the message
  68  * passing are implemented with LOCK-prefixed instructions, which are
  69  * equivalent to mfence.
  70  *
  71  * One interesting aspect of this implementation is that it allows 2 or more
  72  * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
  73  * The cross call processing by the CPUs will happen in any order with only
  74  * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
  75  * from cross calls before all slaves have invoked the function.
  76  *
  77  * The reason for this asynchronous approach is to allow for fast global
  78  * TLB shootdowns. If all CPUs, say N of them, tried to do a global TLB
  79  * invalidation on different virtual addresses at the same time, the old
  80  * code required N squared IPIs. With this method, depending on timing,
  81  * it could happen with just N IPIs.
  82  */
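
The extract side that the comment above points to can be sketched the same way:
pop the head with atomic_cas_ptr() and retry if another CPU won the race. This
reuses the illustrative sketch_msg_t from the push sketch earlier and is not the
real xc_extract():

/*
 * Pop the head of the list, or return NULL if the list is empty. The
 * cmpxchg succeeds only if the head is still the one we read, so a
 * concurrent insert or extract simply forces another pass.
 */
static sketch_msg_t *
sketch_extract(sketch_msg_t **queue)
{
        sketch_msg_t *old_head;

        do {
                old_head = *queue;
                if (old_head == NULL)
                        return (NULL);
        } while (atomic_cas_ptr(queue, old_head, old_head->next) != old_head);

        old_head->next = NULL;
        return (old_head);
}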
  83 
  84 /*
  85  * The default is to not enable collection of IPI counts, since
  86  * updating the shared cachelines could cause excess bus traffic.
  87  */


 125  * operations don't accept volatile bit vectors - which is a bit silly.
 126  */
 127 #define XC_BT_SET(vector, b)    BT_ATOMIC_SET((ulong_t *)(vector), (b))
 128 #define XC_BT_CLEAR(vector, b)  BT_ATOMIC_CLEAR((ulong_t *)(vector), (b))
 129 
 130 /*
 131  * Decrement a CPU's work count
 132  */
 133 static void
 134 xc_decrement(struct machcpu *mcpu)
 135 {
 136         atomic_dec_32(&mcpu->xc_work_cnt);
 137 }
 138 
 139 /*
 140  * Increment a CPU's work count and return the old value
 141  */
 142 static int
 143 xc_increment(struct machcpu *mcpu)
 144 {
 145         return (atomic_inc_32_nv(&mcpu->xc_work_cnt) - 1);




 146 }
 147 
 148 /*
 149  * Put a message into a queue. The insertion is atomic no matter
 150  * how many different inserts/extracts to the same queue happen.
 151  */
 152 static void
 153 xc_insert(void *queue, xc_msg_t *msg)
 154 {
 155         xc_msg_t *old_head;
 156 
 157         /*
 158          * FREE messages should only ever be inserted into
 159          * the xc_master CPU's xc_free queue.
 160          */
 161         ASSERT(msg->xc_command != XC_MSG_FREE ||
 162             cpu[msg->xc_master] == NULL || /* possible only during init */
 163             queue == &cpu[msg->xc_master]->cpu_m.xc_free);
 164 
 165         do {