47 #include <sys/x86_archext.h>
48
49 /*
50 * Implementation for cross-processor calls via interprocessor interrupts
51 *
52 * This implementation uses a message passing architecture to allow multiple
53 * concurrent cross calls to be in flight at any given time. We use the cmpxchg
54 * instruction, aka atomic_cas_ptr(), to implement simple efficient work
55 * queues for message passing between CPUs with almost no need for regular
56 * locking. See xc_extract() and xc_insert() below.
57 *
58 * The general idea is that initiating a cross call means putting a message
59 * on a target(s) CPU's work queue. Any synchronization is handled by passing
60 * the message back and forth between initiator and target(s).
61 *
62 * Every CPU has xc_work_cnt, which indicates it has messages to process.
63 * This value is incremented as message traffic is initiated and decremented
64 * with every message that finishes all processing.
65 *
66 * The code needs no mfence or other membar_*() calls. The uses of
67 * atomic_cas_ptr(), atomic_cas_32() and atomic_dec_32() for the message
68 * passing are implemented with LOCK prefix instructions which are
69 * equivalent to mfence.
70 *
71 * One interesting aspect of this implementation is that it allows 2 or more
72 * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
73 * The cross call processing by the CPUs will happen in any order with only
74 * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
75 * from cross calls before all slaves have invoked the function.
76 *
77 * The reason for this asynchronous approach is to allow for fast global
78 * TLB shootdowns. If all CPUs, say N, tried to do a global TLB invalidation
79 * on a different Virtual Address at the same time, the old code required
80 * N squared IPIs. With this method, depending on timing, it could happen
81 * with just N IPIs.
82 */
83
84 /*
85 * The default is to not enable collecting counts of IPI information, since
86 * the updating of shared cachelines could cause excess bus traffic.
87 */
125 * operations don't accept volatile bit vectors - which is a bit silly.
126 */
127 #define XC_BT_SET(vector, b) BT_ATOMIC_SET((ulong_t *)(vector), (b))
128 #define XC_BT_CLEAR(vector, b) BT_ATOMIC_CLEAR((ulong_t *)(vector), (b))
129
130 /*
131 * Decrement a CPU's work count
132 */
133 static void
134 xc_decrement(struct machcpu *mcpu)
135 {
136 atomic_dec_32(&mcpu->xc_work_cnt);
137 }
138
139 /*
140 * Increment a CPU's work count and return the old value
141 */
142 static int
143 xc_increment(struct machcpu *mcpu)
144 {
145 int old;
146 do {
147 old = mcpu->xc_work_cnt;
148 } while (atomic_cas_32(&mcpu->xc_work_cnt, old, old + 1) != old);
149 return (old);
150 }
151
152 /*
153 * Put a message into a queue. The insertion is atomic no matter
154 * how many different inserts/extracts to the same queue happen.
155 */
156 static void
157 xc_insert(void *queue, xc_msg_t *msg)
158 {
159 xc_msg_t *old_head;
160
161 /*
162 * FREE messages should only ever be getting inserted into
163 * the xc_master CPUs xc_free queue.
164 */
165 ASSERT(msg->xc_command != XC_MSG_FREE ||
166 cpu[msg->xc_master] == NULL || /* possible only during init */
167 queue == &cpu[msg->xc_master]->cpu_m.xc_free);
168
169 do {
|
47 #include <sys/x86_archext.h>
48
49 /*
50 * Implementation for cross-processor calls via interprocessor interrupts
51 *
52 * This implementation uses a message passing architecture to allow multiple
53 * concurrent cross calls to be in flight at any given time. We use the cmpxchg
54 * instruction, aka atomic_cas_ptr(), to implement simple efficient work
55 * queues for message passing between CPUs with almost no need for regular
56 * locking. See xc_extract() and xc_insert() below.
57 *
58 * The general idea is that initiating a cross call means putting a message
59 * on a target(s) CPU's work queue. Any synchronization is handled by passing
60 * the message back and forth between initiator and target(s).
61 *
62 * Every CPU has xc_work_cnt, which indicates it has messages to process.
63 * This value is incremented as message traffic is initiated and decremented
64 * with every message that finishes all processing.
65 *
66 * The code needs no mfence or other membar_*() calls. The uses of
67 * atomic_cas_ptr(), atomic_inc_32_nv() and atomic_dec_32() for the message
68 * passing are implemented with LOCK prefix instructions which are
69 * equivalent to mfence.
70 *
71 * One interesting aspect of this implementation is that it allows 2 or more
72 * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
73 * The cross call processing by the CPUs will happen in any order with only
74 * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
75 * from cross calls before all slaves have invoked the function.
76 *
77 * The reason for this asynchronous approach is to allow for fast global
78 * TLB shootdowns. If all CPUs, say N, tried to do a global TLB invalidation
79 * on a different Virtual Address at the same time, the old code required
80 * N squared IPIs. With this method, depending on timing, it could happen
81 * with just N IPIs.
82 */
83
84 /*
85 * The default is to not enable collecting counts of IPI information, since
86 * the updating of shared cachelines could cause excess bus traffic.
87 */
125 * operations don't accept volatile bit vectors - which is a bit silly.
126 */
127 #define XC_BT_SET(vector, b) BT_ATOMIC_SET((ulong_t *)(vector), (b))
128 #define XC_BT_CLEAR(vector, b) BT_ATOMIC_CLEAR((ulong_t *)(vector), (b))
129
130 /*
131 * Decrement a CPU's work count
132 */
133 static void
134 xc_decrement(struct machcpu *mcpu)
135 {
136 atomic_dec_32(&mcpu->xc_work_cnt);
137 }
138
139 /*
140 * Increment a CPU's work count and return the old value
141 */
142 static int
143 xc_increment(struct machcpu *mcpu)
144 {
145 return (atomic_inc_32_nv(&mcpu->xc_work_cnt) - 1);
146 }
147
148 /*
149 * Put a message into a queue. The insertion is atomic no matter
150 * how many different inserts/extracts to the same queue happen.
151 */
152 static void
153 xc_insert(void *queue, xc_msg_t *msg)
154 {
155 xc_msg_t *old_head;
156
157 /*
158 * FREE messages should only ever be getting inserted into
159 * the xc_master CPUs xc_free queue.
160 */
161 ASSERT(msg->xc_command != XC_MSG_FREE ||
162 cpu[msg->xc_master] == NULL || /* possible only during init */
163 queue == &cpu[msg->xc_master]->cpu_m.xc_free);
164
165 do {
|