#include <sys/cpuvar.h>
#include <sys/x_call.h>
#include <sys/xc_levels.h>
#include <sys/cpu.h>
#include <sys/psw.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/mutex_impl.h>
#include <sys/stack.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>

/*
 * Implementation for cross-processor calls via interprocessor interrupts
 *
 * This implementation uses a message passing architecture to allow multiple
 * concurrent cross calls to be in flight at any given time. We use the
 * cmpxchg instruction, aka atomic_cas_ptr(), to implement simple efficient
 * work queues for message passing between CPUs with almost no need for
 * regular locking.  See xc_extract() and xc_insert() below.
 *
 * The general idea is that initiating a cross call means putting a message
 * on the target CPU's (or CPUs') work queue. Any synchronization is handled
 * by passing the message back and forth between initiator and target(s).
 *
 * Every CPU has xc_work_cnt, which indicates whether it has messages to
 * process. This value is incremented as message traffic is initiated and
 * decremented with every message that finishes all processing.
 *
 * The code needs no mfence or other membar_*() calls. The uses of
 * atomic_cas_ptr(), atomic_cas_32() and atomic_dec_32() for the message
 * passing are implemented with LOCK prefix instructions, which are
 * equivalent to mfence.
 *
 * One interesting aspect of this implementation is that it allows 2 or more
 * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
 * The cross call processing by the CPUs will happen in any order with only
 * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
 * from cross calls before all slaves have invoked the function.
 *
 * The reason for this asynchronous approach is to allow for fast global
 * TLB shootdowns. If all N CPUs tried to do a global TLB invalidation
 * on a different virtual address at the same time, the old code required
 * N squared IPIs. With this method, depending on timing, it could happen
 * with just N IPIs.
 */
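/*
 * A rough sketch of one message round trip, using only names that appear
 * in this file (the intermediate message states and the service routine
 * are defined elsewhere in the file, so treat this as an illustration of
 * the flow rather than the full state machine):
 *
 *	initiator				target
 *	---------				------
 *	take a message from its xc_free list
 *	fill in function and args
 *	xc_increment() target's xc_work_cnt
 *	xc_insert() into target's xc_msgbox
 *	send_dirint(target, XC_HI_PIL)	-->	interrupt at XC_HI_PIL
 *						xc_extract() from xc_msgbox
 *						invoke the cross call function
 *		(for waiting flavors)	<--	xc_insert() a reply message
 *	xc_decrement() as each message finishes
 */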

/*
 * The default is to not enable collecting counts of IPI information, since
 * the updating of shared cachelines could cause excess bus traffic.
 */
uint_t xc_collect_enable = 0;
uint64_t xc_total_cnt = 0;	/* total #IPIs sent for cross calls */
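/*
 * Since xc_collect_enable is a plain global, it can presumably be flipped
 * on at boot with an /etc/system line such as the following (illustrative):
 *
 *	set xc_collect_enable = 1
 */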
/* ... */

#define	XC_BT_CLEAR(vector, b)	BT_ATOMIC_CLEAR((ulong_t *)(vector), (b))

/*
 * Decrement a CPU's work count
 */
static void
xc_decrement(struct machcpu *mcpu)
{
	atomic_dec_32(&mcpu->xc_work_cnt);
}

/*
 * Increment a CPU's work count and return the old value
 */
static int
xc_increment(struct machcpu *mcpu)
{
	int old;
	do {
		old = mcpu->xc_work_cnt;
	} while (atomic_cas_32(&mcpu->xc_work_cnt, old, old + 1) != old);
	return (old);
}
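
/*
 * Note: the CAS loop above, rather than a plain atomic increment, is what
 * makes the old value available to the caller; atomic_inc_32_nv() returns
 * the new value, so presumably "atomic_inc_32_nv(&mcpu->xc_work_cnt) - 1"
 * would be an equivalent way to get the same result.
 */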

/*
 * Put a message into a queue. The insertion is atomic no matter
 * how many different inserts/extracts to the same queue happen.
 */
static void
xc_insert(void *queue, xc_msg_t *msg)
{
	xc_msg_t *old_head;

	/*
	 * FREE messages should only ever be getting inserted into
	 * the xc_master CPU's xc_free queue.
	 */
	ASSERT(msg->xc_command != XC_MSG_FREE ||
	    cpu[msg->xc_master] == NULL || /* possible only during init */
	    queue == &cpu[msg->xc_master]->cpu_m.xc_free);

	do {
		old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
		msg->xc_next = old_head;
	} while (atomic_cas_ptr(queue, old_head, msg) != old_head);
}

/*
 * Extract a message from a queue. The extraction is atomic only
 * when just one thread does extractions from the queue.
 * If the queue is empty, NULL is returned.
 */
static xc_msg_t *
xc_extract(xc_msg_t **queue)
{
	xc_msg_t *old_head;

	do {
		old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
		if (old_head == NULL)
			return (old_head);
	} while (atomic_cas_ptr(queue, old_head, old_head->xc_next) !=
	    old_head);
	old_head->xc_next = NULL;
	return (old_head);
}
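
/*
 * For reference, xc_insert()/xc_extract() form the classic lock-free
 * "Treiber stack".  A minimal userland analogue using C11 atomics (an
 * illustrative sketch, not kernel code) is the insert side:
 *
 *	#include <stdatomic.h>
 *	#include <stddef.h>
 *
 *	struct node {
 *		struct node *next;
 *	};
 *
 *	void
 *	push(struct node *_Atomic *head, struct node *n)
 *	{
 *		struct node *old = atomic_load(head);
 *		do {
 *			n->next = old;
 *		} while (!atomic_compare_exchange_weak(head, &old, n));
 *	}
 *
 * and the extract side, which (like xc_extract() above) is only safe with
 * a single consumer:
 *
 *	struct node *
 *	pop(struct node *_Atomic *head)
 *	{
 *		struct node *old = atomic_load(head);
 *		while (old != NULL &&
 *		    !atomic_compare_exchange_weak(head, &old, old->next))
 *			;
 *		return (old);
 *	}
 *
 * The single-consumer restriction guards against the ABA problem: a second
 * concurrent extractor could remove the head A and its successor B and then
 * reinsert A, after which the first extractor's compare-and-swap still sees
 * A at the head and succeeds, installing the stale next pointer B even
 * though B is no longer on the queue.
 */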

/*
 * Initialize the machcpu fields used for cross calls
 */
static uint_t xc_initialized = 0;

void
xc_init_cpu(struct cpu *cpup)
{
	xc_msg_t *msg;
	int c;

	/*
	 * Allocate message buffers for the new CPU.
	 */
	for (c = 0; c < max_ncpus; ++c) {
		if (plat_dr_support_cpu()) {

/* ... */

	xc_priority_data.xc_func = func;
	xc_priority_data.xc_a1 = arg1;
	xc_priority_data.xc_a2 = arg2;
	xc_priority_data.xc_a3 = arg3;

	/*
	 * Post messages to all CPUs involved that are CPU_READY.
	 * We'll always IPI, plus bang on the xc_msgbox for i86_mwait().
	 */
	for (c = 0; c < max_ncpus; ++c) {
		if (!BT_TEST(set, c))
			continue;
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY) ||
		    cpup == CPU)
			continue;
		(void) xc_increment(&cpup->cpu_m);
		XC_BT_SET(xc_priority_set, c);
		send_dirint(c, XC_HI_PIL);
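		/*
		 * The LOCK-prefixed dummy compare-and-swaps below dirty the
		 * xc_msgbox cacheline; presumably that is what kicks a CPU
		 * napping in i86_mwait() out of MWAIT in case the IPI alone
		 * does not do it.
		 */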
		for (i = 0; i < 10; ++i) {
			(void) atomic_cas_ptr(&cpup->cpu_m.xc_msgbox,
			    cpup->cpu_m.xc_msgbox, cpup->cpu_m.xc_msgbox);
		}
	}
}

/*
 * Do a cross call to all other CPUs with absolutely no waiting or
 * handshaking.  This should only be used for extraordinary operations,
 * like panic(), which need to work, in some fashion, in a not completely
 * functional system.  All other uses that want minimal waiting should use
 * xc_call_nowait().
 */
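/*
 * A hypothetical invocation, from panic-like code, that runs a handler on
 * every other ready CPU without waiting (the set construction and handler
 * name are illustrative, not taken from an actual caller):
 *
 *	cpuset_t xcset;
 *
 *	CPUSET_ALL_BUT(xcset, CPU->cpu_id);
 *	xc_priority(0, 0, 0, CPUSET2BV(xcset), my_stop_func);
 */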
void
xc_priority(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	extern int IGNORE_KERNEL_PREEMPTION;