Print this page
5045 use atomic_{inc,dec}_* instead of atomic_add_*
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/inet/ip/ip_ire.c
+++ new/usr/src/uts/common/inet/ip/ip_ire.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 1990 Mentat Inc.
24 24 */
25 25
26 26 /*
27 27 * This file contains routines that manipulate Internet Routing Entries (IREs).
28 28 */
29 29
30 30 #include <sys/types.h>
31 31 #include <sys/stream.h>
32 32 #include <sys/stropts.h>
33 33 #include <sys/strsun.h>
34 34 #include <sys/strsubr.h>
35 35 #include <sys/ddi.h>
36 36 #include <sys/cmn_err.h>
37 37 #include <sys/policy.h>
38 38
39 39 #include <sys/systm.h>
40 40 #include <sys/kmem.h>
41 41 #include <sys/param.h>
42 42 #include <sys/socket.h>
43 43 #include <net/if.h>
44 44 #include <net/route.h>
45 45 #include <netinet/in.h>
46 46 #include <net/if_dl.h>
47 47 #include <netinet/ip6.h>
48 48 #include <netinet/icmp6.h>
49 49
50 50 #include <inet/common.h>
51 51 #include <inet/mi.h>
52 52 #include <inet/ip.h>
53 53 #include <inet/ip6.h>
54 54 #include <inet/ip_ndp.h>
55 55 #include <inet/arp.h>
56 56 #include <inet/ip_if.h>
57 57 #include <inet/ip_ire.h>
58 58 #include <inet/ip_ftable.h>
59 59 #include <inet/ip_rts.h>
60 60 #include <inet/nd.h>
61 61 #include <inet/tunables.h>
62 62
63 63 #include <inet/tcp.h>
64 64 #include <inet/ipclassifier.h>
65 65 #include <sys/zone.h>
66 66 #include <sys/cpuvar.h>
67 67
68 68 #include <sys/tsol/label.h>
69 69 #include <sys/tsol/tnet.h>
70 70
71 71 struct kmem_cache *rt_entry_cache;
72 72
73 73 typedef struct nce_clookup_s {
74 74 ipaddr_t ncecl_addr;
75 75 boolean_t ncecl_found;
76 76 } nce_clookup_t;
77 77
78 78 /*
79 79 * Synchronization notes:
80 80 *
81 81 * The fields of the ire_t struct are protected in the following way :
82 82 *
83 83 * ire_next/ire_ptpn
84 84 *
85 85 * - bucket lock of the forwarding table in which the ire is stored.
86 86 *
87 87 * ire_ill, ire_u *except* ire_gateway_addr[v6], ire_mask,
88 88 * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags,
89 89 * ire_bucket
90 90 *
91 91 * - Set in ire_create_v4/v6 and never changes after that. Thus,
92 92 * we don't need a lock whenever these fields are accessed.
93 93 *
94 94 * - ire_bucket and ire_masklen (also set in ire_create) are set in
95 95 * ire_add before inserting in the bucket and never
96 96 * changes after that. Thus we don't need a lock whenever these
97 97 * fields are accessed.
98 98 *
99 99 * ire_gateway_addr_v4[v6]
100 100 *
101 101 * - ire_gateway_addr_v4[v6] is set during ire_create and later modified
102 102 * by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to
103 103 * it are assumed to be atomic and hence the other parts of the code
104 104 * do not use any locks. ire_gateway_addr_v6 updates are not atomic
105 105 * and hence any access to it uses ire_lock to get/set the right value.
106 106 *
107 107 * ire_refcnt, ire_identical_ref
108 108 *
109 109 * - Updated atomically using atomic_inc_32/atomic_dec_32 (and _nv variants)
110 110 *
111 111 * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count
112 112 *
113 113 * - Assumes that 32 bit writes are atomic. No locks. ire_lock is
114 114 * used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt.
115 115 *
116 116 * ire_generation
117 117 * - Under ire_lock
118 118 *
119 119 * ire_nce_cache
120 120 * - Under ire_lock
121 121 *
122 122 * ire_dep_parent (To next IRE in recursive lookup chain)
123 123 * - Under ips_ire_dep_lock. Write held when modifying. Read held when
124 124 * walking. We also hold ire_lock when modifying to allow the data path
125 125 * to only acquire ire_lock.
126 126 *
127 127 * ire_dep_parent_generation (Generation number from ire_dep_parent)
128 128 * - Under ips_ire_dep_lock and/or ire_lock. (A read claim on the dep_lock
129 129 * and ire_lock held when modifying)
130 130 *
131 131 * ire_dep_children (From parent to first child)
132 132 * ire_dep_sib_next (linked list of siblings)
133 133 * ire_dep_sib_ptpn (linked list of siblings)
134 134 * - Under ips_ire_dep_lock. Write held when modifying. Read held when
135 135 * walking.
136 136 *
137 137 * As we always hold the bucket locks in all the places while accessing
138 138 * the above values, it is natural to use them for protecting them.
139 139 *
140 140 * We have a forwarding table for IPv4 and IPv6. The IPv6 forwarding table
141 141 * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t
142 142 * structures. ip_forwarding_table_v6 is allocated dynamically in
143 143 * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads
144 144 * initializing the same bucket. Once a bucket is initialized, it is never
145 145 * de-allocated. This assumption enables us to access
146 146 * ip_forwarding_table_v6[i] without any locks.
147 147 *
148 148 * The forwarding table for IPv4 is a radix tree whose leaves
149 149 * are rt_entry structures containing the irb_t for the rt_dst. The irb_t
150 150 * for IPv4 is dynamically allocated and freed.
151 151 *
152 152 * Each irb_t - ire bucket structure has a lock to protect
153 153 * a bucket and the ires residing in the bucket have a back pointer to
154 154 * the bucket structure. It also has a reference count for the number
155 155 * of threads walking the bucket - irb_refcnt which is bumped up
156 156 * using the irb_refhold function. The flags irb_marks can be
157 157 * set to IRB_MARK_CONDEMNED indicating that there are some ires
158 158 * in this bucket that are IRE_IS_CONDEMNED and the
159 159 * last thread to leave the bucket should delete the ires. Usually
160 160 * this is done by the irb_refrele function which is used to decrement
161 161 * the reference count on a bucket. See comments above irb_t structure
162 162 * definition in ip.h for further details.
163 163 *
164 164 * The ire_refhold/ire_refrele functions operate on the ire which increments/
165 165 * decrements the reference count, ire_refcnt, atomically on the ire.
166 166 * ire_refcnt is modified only using those functions. Operations on the IRE
167 167 * could be described as follows :
168 168 *
169 169 * CREATE an ire with reference count initialized to 1.
170 170 *
171 171 * ADDITION of an ire holds the bucket lock, checks for duplicates
172 172 * and then adds the ire. ire_add returns the ire after
173 173 * bumping up once more i.e the reference count is 2. This is to avoid
174 174 * an extra lookup in the functions calling ire_add which wants to
175 175 * work with the ire after adding.
176 176 *
177 177 * LOOKUP of an ire bumps up the reference count using ire_refhold
178 178 * function. It is valid to bump up the reference count of the IRE,
179 179 * after the lookup has returned an ire. Following are the lookup
180 180 * functions that return an HELD ire :
181 181 *
182 182 * ire_ftable_lookup[_v6], ire_lookup_multi_ill[_v6]
183 183 *
184 184 * DELETION of an ire holds the bucket lock, removes it from the list
185 185 * and then decrements the reference count for having removed from the list
186 186 * by using the ire_refrele function. If some other thread has looked up
187 187 * the ire, the reference count would have been bumped up and hence
188 188 * this ire will not be freed once deleted. It will be freed once the
189 189 * reference count drops to zero.
190 190 *
191 191 * Add and Delete acquires the bucket lock as RW_WRITER, while all the
192 192 * lookups acquire the bucket lock as RW_READER.
193 193 *
194 194 * The general rule is to do the ire_refrele in the function
195 195 * that is passing the ire as an argument.
196 196 *
197 197 * In trying to locate ires the following points are to be noted.
198 198 *
199 199 * IRE_IS_CONDEMNED signifies that the ire has been logically deleted and is
200 200 * to be ignored when walking the ires using ire_next.
201 201 *
202 202 * Zones note:
203 203 * Walking IREs within a given zone also walks certain ires in other
204 204 * zones. This is done intentionally. IRE walks with a specified
205 205 * zoneid are used only when doing informational reports, and
206 206 * zone users want to see things that they can access. See block
207 207 * comment in ire_walk_ill_match().
208 208 */
209 209
210 210 /*
211 211 * The size of the forwarding table. We will make sure that it is a
212 212 * power of 2 in ip_ire_init().
213 213 * Settable in /etc/system
214 214 */
215 215 uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE;
216 216
217 217 struct kmem_cache *ire_cache;
218 218 struct kmem_cache *ncec_cache;
219 219 struct kmem_cache *nce_cache;
220 220
221 221 static ire_t ire_null;
222 222
223 223 static ire_t *ire_add_v4(ire_t *ire);
224 224 static void ire_delete_v4(ire_t *ire);
225 225 static void ire_dep_invalidate_children(ire_t *child);
226 226 static void ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers,
227 227 zoneid_t zoneid, ip_stack_t *);
228 228 static void ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type,
229 229 pfv_t func, void *arg, uchar_t vers, ill_t *ill);
230 230 #ifdef DEBUG
231 231 static void ire_trace_cleanup(const ire_t *);
232 232 #endif
233 233 static void ire_dep_incr_generation_locked(ire_t *);
234 234
235 235 /*
236 236 * Following are the functions to increment/decrement the reference
237 237 * count of the IREs and IRBs (ire bucket).
238 238 *
239 239 * 1) We bump up the reference count of an IRE to make sure that
240 240 * it does not get deleted and freed while we are using it.
241 241 * Typically all the lookup functions hold the bucket lock,
242 242 * and look for the IRE. If it finds an IRE, it bumps up the
243 243 * reference count before dropping the lock. Sometimes we *may* want
244 244 * to bump up the reference count after we *looked* up i.e without
245 245 * holding the bucket lock. So, the ire_refhold function does not assert
246 246 * on the bucket lock being held. Any thread trying to delete from
247 247 * the hash bucket can still do so but cannot free the IRE if
248 248 * ire_refcnt is not 0.
249 249 *
250 250 * 2) We bump up the reference count on the bucket where the IRE resides
251 251 * (IRB), when we want to prevent the IREs getting deleted from a given
252 252 * hash bucket. This makes life easier for ire_walk type functions which
253 253 * wants to walk the IRE list, call a function, but needs to drop
254 254 * the bucket lock to prevent recursive rw_enters. While the
255 255 * lock is dropped, the list could be changed by other threads or
256 256 * the same thread could end up deleting the ire or the ire pointed by
257 257 * ire_next. ire_refholding the ire or ire_next is not sufficient as
258 258 * a delete will still remove the ire from the bucket while we have
259 259 * dropped the lock and hence the ire_next would be NULL. Thus, we
260 260 * need a mechanism to prevent deletions from a given bucket.
261 261 *
262 262 * To prevent deletions, we bump up the reference count on the
263 263 * bucket. If the bucket is held, ire_delete just marks both
264 264 * the ire and irb as CONDEMNED. When the
265 265 * reference count on the bucket drops to zero, all the CONDEMNED ires
266 266 * are deleted. We don't have to bump up the reference count on the
267 267 * bucket if we are walking the bucket and never have to drop the bucket
268 268 * lock. Note that irb_refhold does not prevent addition of new ires
269 269 * in the list. It is okay because addition of new ires will not cause
270 270 * ire_next to point to freed memory. We do irb_refhold only when
271 271 * all of the 3 conditions are true :
272 272 *
273 273 * 1) The code needs to walk the IRE bucket from start to end.
274 274 * 2) It may have to drop the bucket lock sometimes while doing (1)
275 275 * 3) It does not want any ires to be deleted meanwhile.
276 276 */
277 277
278 278 /*
279 279 * Bump up the reference count on the hash bucket - IRB to
280 280 * prevent ires from being deleted in this bucket.
281 281 */
282 282 void
283 283 irb_refhold(irb_t *irb)
284 284 {
285 285 rw_enter(&irb->irb_lock, RW_WRITER);
286 286 irb->irb_refcnt++;
287 287 ASSERT(irb->irb_refcnt != 0);
288 288 rw_exit(&irb->irb_lock);
289 289 }
290 290
291 291 void
292 292 irb_refhold_locked(irb_t *irb)
293 293 {
294 294 ASSERT(RW_WRITE_HELD(&irb->irb_lock));
295 295 irb->irb_refcnt++;
296 296 ASSERT(irb->irb_refcnt != 0);
297 297 }
298 298
299 299 /*
300 300 * Note: when IRB_MARK_DYNAMIC is not set the irb_t
301 301 * is statically allocated, so that when the irb_refcnt goes to 0,
302 302 * we simply clean up the ire list and continue.
303 303 */
304 304 void
305 305 irb_refrele(irb_t *irb)
306 306 {
307 307 if (irb->irb_marks & IRB_MARK_DYNAMIC) {
308 308 irb_refrele_ftable(irb);
309 309 } else {
310 310 rw_enter(&irb->irb_lock, RW_WRITER);
311 311 ASSERT(irb->irb_refcnt != 0);
312 312 if (--irb->irb_refcnt == 0 &&
313 313 (irb->irb_marks & IRB_MARK_CONDEMNED)) {
314 314 ire_t *ire_list;
315 315
316 316 ire_list = ire_unlink(irb);
317 317 rw_exit(&irb->irb_lock);
318 318 ASSERT(ire_list != NULL);
319 319 ire_cleanup(ire_list);
320 320 } else {
321 321 rw_exit(&irb->irb_lock);
322 322 }
323 323 }
324 324 }
325 325
↓ open down ↓ |
325 lines elided |
↑ open up ↑ |
326 326
327 327 /*
328 328 * Bump up the reference count on the IRE. We cannot assert that the
329 329 * bucket lock is being held as it is legal to bump up the reference
330 330 * count after the first lookup has returned the IRE without
331 331 * holding the lock.
332 332 */
333 333 void
334 334 ire_refhold(ire_t *ire)
335 335 {
336 - atomic_add_32(&(ire)->ire_refcnt, 1);
336 + atomic_inc_32(&(ire)->ire_refcnt);
337 337 ASSERT((ire)->ire_refcnt != 0);
338 338 #ifdef DEBUG
339 339 ire_trace_ref(ire);
340 340 #endif
341 341 }
342 342
343 343 void
344 344 ire_refhold_notr(ire_t *ire)
345 345 {
346 - atomic_add_32(&(ire)->ire_refcnt, 1);
346 + atomic_inc_32(&(ire)->ire_refcnt);
347 347 ASSERT((ire)->ire_refcnt != 0);
348 348 }
349 349
350 350 void
351 351 ire_refhold_locked(ire_t *ire)
352 352 {
353 353 #ifdef DEBUG
354 354 ire_trace_ref(ire);
355 355 #endif
356 356 ire->ire_refcnt++;
357 357 }
358 358
359 359 /*
360 360 * Release a ref on an IRE.
361 361 *
362 362 * Must not be called while holding any locks. Otherwise if this is
363 363 * the last reference to be released there is a chance of recursive mutex
364 364 * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
365 365 * to restart an ioctl. The one exception is when the caller is sure that
366 366 * this is not the last reference to be released. Eg. if the caller is
367 367 * sure that the ire has not been deleted and won't be deleted.
368 368 *
369 369 * In architectures e.g sun4u, where atomic_add_32_nv is just
370 370 * a cas, we need to maintain the right memory barrier semantics
371 371 * as that of mutex_exit i.e all the loads and stores should complete
↓ open down ↓ |
15 lines elided |
↑ open up ↑ |
372 372 * before the cas is executed. membar_exit() does that here.
373 373 */
374 374 void
375 375 ire_refrele(ire_t *ire)
376 376 {
377 377 #ifdef DEBUG
378 378 ire_untrace_ref(ire);
379 379 #endif
380 380 ASSERT((ire)->ire_refcnt != 0);
381 381 membar_exit();
382 - if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0)
382 + if (atomic_dec_32_nv(&(ire)->ire_refcnt) == 0)
383 383 ire_inactive(ire);
384 384 }
385 385
386 386 void
387 387 ire_refrele_notr(ire_t *ire)
388 388 {
389 389 ASSERT((ire)->ire_refcnt != 0);
390 390 membar_exit();
391 - if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0)
391 + if (atomic_dec_32_nv(&(ire)->ire_refcnt) == 0)
392 392 ire_inactive(ire);
393 393 }
394 394
395 395 /*
396 396 * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY]
397 397 * IOCTL[s]. The NO_REPLY form is used by TCP to tell IP that it is
398 398 * having problems reaching a particular destination.
399 399 * This will make IP consider alternate routes (e.g., when there are
400 400 * muliple default routes), and it will also make IP discard any (potentially)
401 401 * stale redirect.
402 402 * Management processes may want to use the version that generates a reply.
403 403 *
404 404 * With the use of NUD like behavior for IPv4/ARP in addition to IPv6
405 405 * this function shouldn't be necessary for IP to recover from a bad redirect,
406 406 * a bad default router (when there are multiple default routers), or
407 407 * a stale ND/ARP entry. But we retain it in any case.
408 408 * For instance, this is helpful when TCP suspects a failure before NUD does.
409 409 */
410 410 int
411 411 ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
412 412 {
413 413 uchar_t *addr_ucp;
414 414 uint_t ipversion;
415 415 sin_t *sin;
416 416 sin6_t *sin6;
417 417 ipaddr_t v4addr;
418 418 in6_addr_t v6addr;
419 419 ire_t *ire;
420 420 ipid_t *ipid;
421 421 zoneid_t zoneid;
422 422 ip_stack_t *ipst;
423 423
424 424 ASSERT(q->q_next == NULL);
425 425 zoneid = IPCL_ZONEID(Q_TO_CONN(q));
426 426 ipst = CONNQ_TO_IPST(q);
427 427
428 428 /*
429 429 * Check privilege using the ioctl credential; if it is NULL
430 430 * then this is a kernel message and therefor privileged.
431 431 */
432 432 if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
433 433 return (EPERM);
434 434
435 435 ipid = (ipid_t *)mp->b_rptr;
436 436
437 437 addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset,
438 438 ipid->ipid_addr_length);
439 439 if (addr_ucp == NULL || !OK_32PTR(addr_ucp))
440 440 return (EINVAL);
441 441 switch (ipid->ipid_addr_length) {
442 442 case sizeof (sin_t):
443 443 /*
444 444 * got complete (sockaddr) address - increment addr_ucp to point
445 445 * at the ip_addr field.
446 446 */
447 447 sin = (sin_t *)addr_ucp;
448 448 addr_ucp = (uchar_t *)&sin->sin_addr.s_addr;
449 449 ipversion = IPV4_VERSION;
450 450 break;
451 451 case sizeof (sin6_t):
452 452 /*
453 453 * got complete (sockaddr) address - increment addr_ucp to point
454 454 * at the ip_addr field.
455 455 */
456 456 sin6 = (sin6_t *)addr_ucp;
457 457 addr_ucp = (uchar_t *)&sin6->sin6_addr;
458 458 ipversion = IPV6_VERSION;
459 459 break;
460 460 default:
461 461 return (EINVAL);
462 462 }
463 463 if (ipversion == IPV4_VERSION) {
464 464 /* Extract the destination address. */
465 465 bcopy(addr_ucp, &v4addr, IP_ADDR_LEN);
466 466
467 467 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL,
468 468 zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
469 469 } else {
470 470 /* Extract the destination address. */
471 471 bcopy(addr_ucp, &v6addr, IPV6_ADDR_LEN);
472 472
473 473 ire = ire_ftable_lookup_v6(&v6addr, NULL, NULL, 0, NULL,
474 474 zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
475 475 }
476 476 if (ire != NULL) {
477 477 if (ipversion == IPV4_VERSION) {
478 478 ip_rts_change(RTM_LOSING, ire->ire_addr,
479 479 ire->ire_gateway_addr, ire->ire_mask,
480 480 (Q_TO_CONN(q))->conn_laddr_v4, 0, 0, 0,
481 481 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
482 482 ire->ire_ipst);
483 483 }
484 484 (void) ire_no_good(ire);
485 485 ire_refrele(ire);
486 486 }
487 487 return (0);
488 488 }
489 489
490 490 /*
491 491 * Initialize the ire that is specific to IPv4 part and call
492 492 * ire_init_common to finish it.
493 493 * Returns zero or errno.
494 494 */
495 495 int
496 496 ire_init_v4(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *gateway,
497 497 ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags,
498 498 tsol_gc_t *gc, ip_stack_t *ipst)
499 499 {
500 500 int error;
501 501
502 502 /*
503 503 * Reject IRE security attribute creation/initialization
504 504 * if system is not running in Trusted mode.
505 505 */
506 506 if (gc != NULL && !is_system_labeled())
507 507 return (EINVAL);
508 508
509 509 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced);
510 510
511 511 if (addr != NULL)
512 512 bcopy(addr, &ire->ire_addr, IP_ADDR_LEN);
513 513 if (gateway != NULL)
514 514 bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN);
515 515
516 516 /* Make sure we don't have stray values in some fields */
517 517 switch (type) {
518 518 case IRE_LOOPBACK:
519 519 case IRE_HOST:
520 520 case IRE_BROADCAST:
521 521 case IRE_LOCAL:
522 522 case IRE_IF_CLONE:
523 523 ire->ire_mask = IP_HOST_MASK;
524 524 ire->ire_masklen = IPV4_ABITS;
525 525 break;
526 526 case IRE_PREFIX:
527 527 case IRE_DEFAULT:
528 528 case IRE_IF_RESOLVER:
529 529 case IRE_IF_NORESOLVER:
530 530 if (mask != NULL) {
531 531 bcopy(mask, &ire->ire_mask, IP_ADDR_LEN);
532 532 ire->ire_masklen = ip_mask_to_plen(ire->ire_mask);
533 533 }
534 534 break;
535 535 case IRE_MULTICAST:
536 536 case IRE_NOROUTE:
537 537 ASSERT(mask == NULL);
538 538 break;
539 539 default:
540 540 ASSERT(0);
541 541 return (EINVAL);
542 542 }
543 543
544 544 error = ire_init_common(ire, type, ill, zoneid, flags, IPV4_VERSION,
545 545 gc, ipst);
546 546 if (error != NULL)
547 547 return (error);
548 548
549 549 /* Determine which function pointers to use */
550 550 ire->ire_postfragfn = ip_xmit; /* Common case */
551 551
552 552 switch (ire->ire_type) {
553 553 case IRE_LOCAL:
554 554 ire->ire_sendfn = ire_send_local_v4;
555 555 ire->ire_recvfn = ire_recv_local_v4;
556 556 ASSERT(ire->ire_ill != NULL);
557 557 if (ire->ire_ill->ill_flags & ILLF_NOACCEPT)
558 558 ire->ire_recvfn = ire_recv_noaccept_v6;
559 559 break;
560 560 case IRE_LOOPBACK:
561 561 ire->ire_sendfn = ire_send_local_v4;
562 562 ire->ire_recvfn = ire_recv_loopback_v4;
563 563 break;
564 564 case IRE_BROADCAST:
565 565 ire->ire_postfragfn = ip_postfrag_loopcheck;
566 566 ire->ire_sendfn = ire_send_broadcast_v4;
567 567 ire->ire_recvfn = ire_recv_broadcast_v4;
568 568 break;
569 569 case IRE_MULTICAST:
570 570 ire->ire_postfragfn = ip_postfrag_loopcheck;
571 571 ire->ire_sendfn = ire_send_multicast_v4;
572 572 ire->ire_recvfn = ire_recv_multicast_v4;
573 573 break;
574 574 default:
575 575 /*
576 576 * For IRE_IF_ALL and IRE_OFFLINK we forward received
577 577 * packets by default.
578 578 */
579 579 ire->ire_sendfn = ire_send_wire_v4;
580 580 ire->ire_recvfn = ire_recv_forward_v4;
581 581 break;
582 582 }
583 583 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
584 584 ire->ire_sendfn = ire_send_noroute_v4;
585 585 ire->ire_recvfn = ire_recv_noroute_v4;
586 586 } else if (ire->ire_flags & RTF_MULTIRT) {
587 587 ire->ire_postfragfn = ip_postfrag_multirt_v4;
588 588 ire->ire_sendfn = ire_send_multirt_v4;
589 589 /* Multirt receive of broadcast uses ire_recv_broadcast_v4 */
590 590 if (ire->ire_type != IRE_BROADCAST)
591 591 ire->ire_recvfn = ire_recv_multirt_v4;
592 592 }
593 593 ire->ire_nce_capable = ire_determine_nce_capable(ire);
594 594 return (0);
595 595 }
596 596
597 597 /*
598 598 * Determine ire_nce_capable
599 599 */
600 600 boolean_t
601 601 ire_determine_nce_capable(ire_t *ire)
602 602 {
603 603 int max_masklen;
604 604
605 605 if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
606 606 (ire->ire_type & IRE_MULTICAST))
607 607 return (B_TRUE);
608 608
609 609 if (ire->ire_ipversion == IPV4_VERSION)
610 610 max_masklen = IPV4_ABITS;
611 611 else
612 612 max_masklen = IPV6_ABITS;
613 613
614 614 if ((ire->ire_type & IRE_ONLINK) && ire->ire_masklen == max_masklen)
615 615 return (B_TRUE);
616 616 return (B_FALSE);
617 617 }
618 618
619 619 /*
620 620 * ire_create is called to allocate and initialize a new IRE.
621 621 *
622 622 * NOTE : This is called as writer sometimes though not required
623 623 * by this function.
624 624 */
625 625 ire_t *
626 626 ire_create(uchar_t *addr, uchar_t *mask, uchar_t *gateway,
627 627 ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, tsol_gc_t *gc,
628 628 ip_stack_t *ipst)
629 629 {
630 630 ire_t *ire;
631 631 int error;
632 632
633 633 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
634 634 if (ire == NULL) {
635 635 DTRACE_PROBE(kmem__cache__alloc);
636 636 return (NULL);
637 637 }
638 638 *ire = ire_null;
639 639
640 640 error = ire_init_v4(ire, addr, mask, gateway, type, ill, zoneid, flags,
641 641 gc, ipst);
642 642 if (error != 0) {
643 643 DTRACE_PROBE2(ire__init, ire_t *, ire, int, error);
644 644 kmem_cache_free(ire_cache, ire);
645 645 return (NULL);
646 646 }
647 647 return (ire);
648 648 }
649 649
650 650 /*
651 651 * Common to IPv4 and IPv6
652 652 * Returns zero or errno.
653 653 */
654 654 int
655 655 ire_init_common(ire_t *ire, ushort_t type, ill_t *ill, zoneid_t zoneid,
656 656 uint_t flags, uchar_t ipversion, tsol_gc_t *gc, ip_stack_t *ipst)
657 657 {
658 658 int error;
659 659
660 660 #ifdef DEBUG
661 661 if (ill != NULL) {
662 662 if (ill->ill_isv6)
663 663 ASSERT(ipversion == IPV6_VERSION);
664 664 else
665 665 ASSERT(ipversion == IPV4_VERSION);
666 666 }
667 667 #endif /* DEBUG */
668 668
669 669 /*
670 670 * Create/initialize IRE security attribute only in Trusted mode;
671 671 * if the passed in gc is non-NULL, we expect that the caller
672 672 * has held a reference to it and will release it when this routine
673 673 * returns a failure, otherwise we own the reference. We do this
674 674 * prior to initializing the rest IRE fields.
675 675 */
676 676 if (is_system_labeled()) {
677 677 if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST |
678 678 IRE_IF_ALL | IRE_MULTICAST | IRE_NOROUTE)) != 0) {
679 679 /* release references on behalf of caller */
680 680 if (gc != NULL)
681 681 GC_REFRELE(gc);
682 682 } else {
683 683 error = tsol_ire_init_gwattr(ire, ipversion, gc);
684 684 if (error != 0)
685 685 return (error);
686 686 }
687 687 }
688 688
689 689 ire->ire_type = type;
690 690 ire->ire_flags = RTF_UP | flags;
691 691 ire->ire_create_time = (uint32_t)gethrestime_sec();
692 692 ire->ire_generation = IRE_GENERATION_INITIAL;
693 693
694 694 /*
695 695 * The ill_ire_cnt isn't increased until
696 696 * the IRE is added to ensure that a walker will find
697 697 * all IREs that hold a reference on an ill.
698 698 *
699 699 * Note that ill_ire_multicast doesn't hold a ref on the ill since
700 700 * ire_add() is not called for the IRE_MULTICAST.
701 701 */
702 702 ire->ire_ill = ill;
703 703 ire->ire_zoneid = zoneid;
704 704 ire->ire_ipversion = ipversion;
705 705
706 706 mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL);
707 707 ire->ire_refcnt = 1;
708 708 ire->ire_identical_ref = 1; /* Number of ire_delete's needed */
709 709 ire->ire_ipst = ipst; /* No netstack_hold */
710 710 ire->ire_trace_disable = B_FALSE;
711 711
712 712 return (0);
713 713 }
714 714
715 715 /*
716 716 * This creates an IRE_BROADCAST based on the arguments.
717 717 * A mirror is ire_lookup_bcast().
718 718 *
719 719 * Any supression of unneeded ones is done in ire_add_v4.
720 720 * We add one IRE_BROADCAST per address. ire_send_broadcast_v4()
721 721 * takes care of generating a loopback copy of the packet.
722 722 */
723 723 ire_t **
724 724 ire_create_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid, ire_t **irep)
725 725 {
726 726 ip_stack_t *ipst = ill->ill_ipst;
727 727
728 728 ASSERT(IAM_WRITER_ILL(ill));
729 729
730 730 *irep++ = ire_create(
731 731 (uchar_t *)&addr, /* dest addr */
732 732 (uchar_t *)&ip_g_all_ones, /* mask */
733 733 NULL, /* no gateway */
734 734 IRE_BROADCAST,
735 735 ill,
736 736 zoneid,
737 737 RTF_KERNEL,
738 738 NULL,
739 739 ipst);
740 740
741 741 return (irep);
742 742 }
743 743
744 744 /*
745 745 * This looks up an IRE_BROADCAST based on the arguments.
746 746 * Mirrors ire_create_bcast().
747 747 */
748 748 ire_t *
749 749 ire_lookup_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
750 750 {
751 751 ire_t *ire;
752 752 int match_args;
753 753
754 754 match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_GW |
755 755 MATCH_IRE_MASK | MATCH_IRE_ZONEONLY;
756 756
757 757 if (IS_UNDER_IPMP(ill))
758 758 match_args |= MATCH_IRE_TESTHIDDEN;
759 759
760 760 ire = ire_ftable_lookup_v4(
761 761 addr, /* dest addr */
762 762 ip_g_all_ones, /* mask */
763 763 0, /* no gateway */
764 764 IRE_BROADCAST,
765 765 ill,
766 766 zoneid,
767 767 NULL,
768 768 match_args,
769 769 0,
770 770 ill->ill_ipst,
771 771 NULL);
772 772 return (ire);
773 773 }
774 774
775 775 /* Arrange to call the specified function for every IRE in the world. */
776 776 void
777 777 ire_walk(pfv_t func, void *arg, ip_stack_t *ipst)
778 778 {
779 779 ire_walk_ipvers(func, arg, 0, ALL_ZONES, ipst);
780 780 }
781 781
782 782 void
783 783 ire_walk_v4(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
784 784 {
785 785 ire_walk_ipvers(func, arg, IPV4_VERSION, zoneid, ipst);
786 786 }
787 787
788 788 void
789 789 ire_walk_v6(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
790 790 {
791 791 ire_walk_ipvers(func, arg, IPV6_VERSION, zoneid, ipst);
792 792 }
793 793
794 794 /*
795 795 * Walk a particular version. version == 0 means both v4 and v6.
796 796 */
797 797 static void
798 798 ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid,
799 799 ip_stack_t *ipst)
800 800 {
801 801 if (vers != IPV6_VERSION) {
802 802 /*
803 803 * ip_forwarding_table variable doesn't matter for IPv4 since
804 804 * ire_walk_ill_tables uses ips_ip_ftable for IPv4.
805 805 */
806 806 ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE,
807 807 0, NULL,
808 808 NULL, zoneid, ipst);
809 809 }
810 810 if (vers != IPV4_VERSION) {
811 811 ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE,
812 812 ipst->ips_ip6_ftable_hash_size,
813 813 ipst->ips_ip_forwarding_table_v6,
814 814 NULL, zoneid, ipst);
815 815 }
816 816 }
817 817
818 818 /*
819 819 * Arrange to call the specified function for every IRE that matches the ill.
820 820 */
821 821 void
822 822 ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
823 823 ill_t *ill)
824 824 {
825 825 uchar_t vers = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
826 826
827 827 ire_walk_ill_ipvers(match_flags, ire_type, func, arg, vers, ill);
828 828 }
829 829
830 830 /*
831 831 * Walk a particular ill and version.
832 832 */
833 833 static void
834 834 ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func,
835 835 void *arg, uchar_t vers, ill_t *ill)
836 836 {
837 837 ip_stack_t *ipst = ill->ill_ipst;
838 838
839 839 if (vers == IPV4_VERSION) {
840 840 ire_walk_ill_tables(match_flags, ire_type, func, arg,
841 841 IP_MASK_TABLE_SIZE,
842 842 0, NULL,
843 843 ill, ALL_ZONES, ipst);
844 844 }
845 845 if (vers != IPV4_VERSION) {
846 846 ire_walk_ill_tables(match_flags, ire_type, func, arg,
847 847 IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size,
848 848 ipst->ips_ip_forwarding_table_v6,
849 849 ill, ALL_ZONES, ipst);
850 850 }
851 851 }
852 852
853 853 /*
854 854 * Do the specific matching of IREs to shared-IP zones.
855 855 *
856 856 * We have the same logic as in ire_match_args but implemented slightly
857 857 * differently.
858 858 */
859 859 boolean_t
860 860 ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
861 861 ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst)
862 862 {
863 863 ill_t *dst_ill = ire->ire_ill;
864 864
865 865 ASSERT(match_flags != 0 || zoneid != ALL_ZONES);
866 866
867 867 if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
868 868 ire->ire_zoneid != ALL_ZONES) {
869 869 /*
870 870 * We're walking the IREs for a specific zone. The only relevant
871 871 * IREs are:
872 872 * - all IREs with a matching ire_zoneid
873 873 * - IRE_IF_ALL IREs for interfaces with a usable source addr
874 874 * with a matching zone
875 875 * - IRE_OFFLINK with a gateway reachable from the zone
876 876 * Note that earlier we only did the IRE_OFFLINK check for
877 877 * IRE_DEFAULT (and only when we had multiple IRE_DEFAULTs).
878 878 */
879 879 if (ire->ire_type & IRE_ONLINK) {
880 880 uint_t ifindex;
881 881
882 882 /*
883 883 * Note there is no IRE_INTERFACE on vniN thus
884 884 * can't do an IRE lookup for a matching route.
885 885 */
886 886 ifindex = dst_ill->ill_usesrc_ifindex;
887 887 if (ifindex == 0)
888 888 return (B_FALSE);
889 889
890 890 /*
891 891 * If there is a usable source address in the
892 892 * zone, then it's ok to return an
893 893 * IRE_INTERFACE
894 894 */
895 895 if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
896 896 zoneid, ipst)) {
897 897 return (B_FALSE);
898 898 }
899 899 }
900 900 if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
901 901 ipif_t *tipif;
902 902
903 903 mutex_enter(&dst_ill->ill_lock);
904 904 for (tipif = dst_ill->ill_ipif;
905 905 tipif != NULL; tipif = tipif->ipif_next) {
906 906 if (!IPIF_IS_CONDEMNED(tipif) &&
907 907 (tipif->ipif_flags & IPIF_UP) &&
908 908 (tipif->ipif_zoneid == zoneid ||
909 909 tipif->ipif_zoneid == ALL_ZONES))
910 910 break;
911 911 }
912 912 mutex_exit(&dst_ill->ill_lock);
913 913 if (tipif == NULL) {
914 914 return (B_FALSE);
915 915 }
916 916 }
917 917 }
918 918 /*
919 919 * Except for ALL_ZONES, we only match the offlink routes
920 920 * where ire_gateway_addr has an IRE_INTERFACE for the zoneid.
921 921 * Since we can have leftover routes after the IP addresses have
922 922 * changed, the global zone will also match offlink routes where the
923 923 * gateway is unreachable from any zone.
924 924 */
925 925 if ((ire->ire_type & IRE_OFFLINK) && zoneid != ALL_ZONES) {
926 926 in6_addr_t gw_addr_v6;
927 927 boolean_t reach;
928 928
929 929 if (ire->ire_ipversion == IPV4_VERSION) {
930 930 reach = ire_gateway_ok_zone_v4(ire->ire_gateway_addr,
931 931 zoneid, dst_ill, NULL, ipst, B_FALSE);
932 932 } else {
933 933 ASSERT(ire->ire_ipversion == IPV6_VERSION);
934 934 mutex_enter(&ire->ire_lock);
935 935 gw_addr_v6 = ire->ire_gateway_addr_v6;
936 936 mutex_exit(&ire->ire_lock);
937 937
938 938 reach = ire_gateway_ok_zone_v6(&gw_addr_v6, zoneid,
939 939 dst_ill, NULL, ipst, B_FALSE);
940 940 }
941 941 if (!reach) {
942 942 if (zoneid != GLOBAL_ZONEID)
943 943 return (B_FALSE);
944 944
945 945 /*
946 946 * Check if ALL_ZONES reachable - if not then let the
947 947 * global zone see it.
948 948 */
949 949 if (ire->ire_ipversion == IPV4_VERSION) {
950 950 reach = ire_gateway_ok_zone_v4(
951 951 ire->ire_gateway_addr, ALL_ZONES,
952 952 dst_ill, NULL, ipst, B_FALSE);
953 953 } else {
954 954 reach = ire_gateway_ok_zone_v6(&gw_addr_v6,
955 955 ALL_ZONES, dst_ill, NULL, ipst, B_FALSE);
956 956 }
957 957 if (reach) {
958 958 /*
959 959 * Some other zone could see it, hence hide it
960 960 * in the global zone.
961 961 */
962 962 return (B_FALSE);
963 963 }
964 964 }
965 965 }
966 966
967 967 if (((!(match_flags & MATCH_IRE_TYPE)) ||
968 968 (ire->ire_type & ire_type)) &&
969 969 ((!(match_flags & MATCH_IRE_ILL)) ||
970 970 (dst_ill == ill ||
971 971 dst_ill != NULL && IS_IN_SAME_ILLGRP(dst_ill, ill)))) {
972 972 return (B_TRUE);
973 973 }
974 974 return (B_FALSE);
975 975 }
976 976
977 977 int
978 978 rtfunc(struct radix_node *rn, void *arg)
979 979 {
980 980 struct rtfuncarg *rtf = arg;
981 981 struct rt_entry *rt;
982 982 irb_t *irb;
983 983 ire_t *ire;
984 984 boolean_t ret;
985 985
986 986 rt = (struct rt_entry *)rn;
987 987 ASSERT(rt != NULL);
988 988 irb = &rt->rt_irb;
989 989 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
990 990 if ((rtf->rt_match_flags != 0) ||
991 991 (rtf->rt_zoneid != ALL_ZONES)) {
992 992 ret = ire_walk_ill_match(rtf->rt_match_flags,
993 993 rtf->rt_ire_type, ire,
994 994 rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst);
995 995 } else {
996 996 ret = B_TRUE;
997 997 }
998 998 if (ret)
999 999 (*rtf->rt_func)(ire, rtf->rt_arg);
1000 1000 }
1001 1001 return (0);
1002 1002 }
1003 1003
1004 1004 /*
1005 1005 * Walk the ftable entries that match the ill.
1006 1006 */
1007 1007 void
1008 1008 ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func,
1009 1009 void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl,
1010 1010 ill_t *ill, zoneid_t zoneid,
1011 1011 ip_stack_t *ipst)
1012 1012 {
1013 1013 irb_t *irb_ptr;
1014 1014 irb_t *irb;
1015 1015 ire_t *ire;
1016 1016 int i, j;
1017 1017 boolean_t ret;
1018 1018 struct rtfuncarg rtfarg;
1019 1019
1020 1020 ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL));
1021 1021 ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0));
1022 1022
1023 1023 /* knobs such that routine is called only for v6 case */
1024 1024 if (ipftbl == ipst->ips_ip_forwarding_table_v6) {
1025 1025 for (i = (ftbl_sz - 1); i >= 0; i--) {
1026 1026 if ((irb_ptr = ipftbl[i]) == NULL)
1027 1027 continue;
1028 1028 for (j = 0; j < htbl_sz; j++) {
1029 1029 irb = &irb_ptr[j];
1030 1030 if (irb->irb_ire == NULL)
1031 1031 continue;
1032 1032
1033 1033 irb_refhold(irb);
1034 1034 for (ire = irb->irb_ire; ire != NULL;
1035 1035 ire = ire->ire_next) {
1036 1036 if (match_flags == 0 &&
1037 1037 zoneid == ALL_ZONES) {
1038 1038 ret = B_TRUE;
1039 1039 } else {
1040 1040 ret =
1041 1041 ire_walk_ill_match(
1042 1042 match_flags,
1043 1043 ire_type, ire, ill,
1044 1044 zoneid, ipst);
1045 1045 }
1046 1046 if (ret)
1047 1047 (*func)(ire, arg);
1048 1048 }
1049 1049 irb_refrele(irb);
1050 1050 }
1051 1051 }
1052 1052 } else {
1053 1053 bzero(&rtfarg, sizeof (rtfarg));
1054 1054 rtfarg.rt_func = func;
1055 1055 rtfarg.rt_arg = arg;
1056 1056 if (match_flags != 0) {
1057 1057 rtfarg.rt_match_flags = match_flags;
1058 1058 }
1059 1059 rtfarg.rt_ire_type = ire_type;
1060 1060 rtfarg.rt_ill = ill;
1061 1061 rtfarg.rt_zoneid = zoneid;
1062 1062 rtfarg.rt_ipst = ipst; /* No netstack_hold */
1063 1063 (void) ipst->ips_ip_ftable->rnh_walktree_mt(
1064 1064 ipst->ips_ip_ftable,
1065 1065 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
1066 1066 }
1067 1067 }
1068 1068
1069 1069 /*
1070 1070 * This function takes a mask and returns
1071 1071 * number of bits set in the mask. If no
1072 1072 * bit is set it returns 0.
1073 1073 * Assumes a contiguous mask.
1074 1074 */
1075 1075 int
1076 1076 ip_mask_to_plen(ipaddr_t mask)
1077 1077 {
1078 1078 return (mask == 0 ? 0 : IP_ABITS - (ffs(ntohl(mask)) -1));
1079 1079 }
1080 1080
1081 1081 /*
1082 1082 * Convert length for a mask to the mask.
1083 1083 */
1084 1084 ipaddr_t
1085 1085 ip_plen_to_mask(uint_t masklen)
1086 1086 {
1087 1087 if (masklen == 0)
1088 1088 return (0);
1089 1089
1090 1090 return (htonl(IP_HOST_MASK << (IP_ABITS - masklen)));
1091 1091 }
1092 1092
1093 1093 void
1094 1094 ire_atomic_end(irb_t *irb_ptr, ire_t *ire)
1095 1095 {
1096 1096 ill_t *ill;
1097 1097
1098 1098 ill = ire->ire_ill;
1099 1099 if (ill != NULL)
1100 1100 mutex_exit(&ill->ill_lock);
1101 1101 rw_exit(&irb_ptr->irb_lock);
1102 1102 }
1103 1103
1104 1104 /*
1105 1105 * ire_add_v[46] atomically make sure that the ill associated
1106 1106 * with the new ire is not going away i.e., we check ILL_CONDEMNED.
1107 1107 */
1108 1108 int
1109 1109 ire_atomic_start(irb_t *irb_ptr, ire_t *ire)
1110 1110 {
1111 1111 ill_t *ill;
1112 1112
1113 1113 ill = ire->ire_ill;
1114 1114
1115 1115 rw_enter(&irb_ptr->irb_lock, RW_WRITER);
1116 1116 if (ill != NULL) {
1117 1117 mutex_enter(&ill->ill_lock);
1118 1118
1119 1119 /*
1120 1120 * Don't allow IRE's to be created on dying ills, or on
1121 1121 * ill's for which the last ipif is going down, or ones which
1122 1122 * don't have even a single UP interface
1123 1123 */
1124 1124 if ((ill->ill_state_flags &
1125 1125 (ILL_CONDEMNED|ILL_DOWN_IN_PROGRESS)) != 0) {
1126 1126 ire_atomic_end(irb_ptr, ire);
1127 1127 DTRACE_PROBE1(ire__add__on__dying__ill, ire_t *, ire);
1128 1128 return (ENXIO);
1129 1129 }
1130 1130
1131 1131 if (IS_UNDER_IPMP(ill)) {
1132 1132 int error = 0;
1133 1133 mutex_enter(&ill->ill_phyint->phyint_lock);
1134 1134 if (!ipmp_ill_is_active(ill) &&
1135 1135 IRE_HIDDEN_TYPE(ire->ire_type) &&
1136 1136 !ire->ire_testhidden) {
1137 1137 error = EINVAL;
1138 1138 }
1139 1139 mutex_exit(&ill->ill_phyint->phyint_lock);
1140 1140 if (error != 0) {
1141 1141 ire_atomic_end(irb_ptr, ire);
1142 1142 return (error);
1143 1143 }
1144 1144 }
1145 1145
1146 1146 }
1147 1147 return (0);
1148 1148 }
1149 1149
1150 1150 /*
1151 1151 * Add a fully initialized IRE to the forwarding table.
1152 1152 * This returns NULL on failure, or a held IRE on success.
1153 1153 * Normally the returned IRE is the same as the argument. But a different
1154 1154 * IRE will be returned if the added IRE is deemed identical to an existing
1155 1155 * one. In that case ire_identical_ref will be increased.
1156 1156 * The caller always needs to do an ire_refrele() on the returned IRE.
1157 1157 */
1158 1158 ire_t *
1159 1159 ire_add(ire_t *ire)
1160 1160 {
1161 1161 if (IRE_HIDDEN_TYPE(ire->ire_type) &&
1162 1162 ire->ire_ill != NULL && IS_UNDER_IPMP(ire->ire_ill)) {
1163 1163 /*
1164 1164 * IREs hosted on interfaces that are under IPMP
1165 1165 * should be hidden so that applications don't
1166 1166 * accidentally end up sending packets with test
1167 1167 * addresses as their source addresses, or
1168 1168 * sending out interfaces that are e.g. IFF_INACTIVE.
1169 1169 * Hide them here.
1170 1170 */
1171 1171 ire->ire_testhidden = B_TRUE;
1172 1172 }
1173 1173
1174 1174 if (ire->ire_ipversion == IPV6_VERSION)
1175 1175 return (ire_add_v6(ire));
1176 1176 else
1177 1177 return (ire_add_v4(ire));
1178 1178 }
1179 1179
1180 1180 /*
1181 1181 * Add a fully initialized IPv4 IRE to the forwarding table.
1182 1182 * This returns NULL on failure, or a held IRE on success.
1183 1183 * Normally the returned IRE is the same as the argument. But a different
1184 1184 * IRE will be returned if the added IRE is deemed identical to an existing
1185 1185 * one. In that case ire_identical_ref will be increased.
1186 1186 * The caller always needs to do an ire_refrele() on the returned IRE.
1187 1187 */
1188 1188 static ire_t *
1189 1189 ire_add_v4(ire_t *ire)
1190 1190 {
1191 1191 ire_t *ire1;
1192 1192 irb_t *irb_ptr;
1193 1193 ire_t **irep;
1194 1194 int match_flags;
1195 1195 int error;
1196 1196 ip_stack_t *ipst = ire->ire_ipst;
1197 1197
1198 1198 if (ire->ire_ill != NULL)
1199 1199 ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock));
1200 1200 ASSERT(ire->ire_ipversion == IPV4_VERSION);
1201 1201
1202 1202 /* Make sure the address is properly masked. */
1203 1203 ire->ire_addr &= ire->ire_mask;
1204 1204
1205 1205 match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
1206 1206
1207 1207 if (ire->ire_ill != NULL) {
1208 1208 match_flags |= MATCH_IRE_ILL;
1209 1209 }
1210 1210 irb_ptr = ire_get_bucket(ire);
1211 1211 if (irb_ptr == NULL) {
1212 1212 printf("no bucket for %p\n", (void *)ire);
1213 1213 ire_delete(ire);
1214 1214 return (NULL);
1215 1215 }
1216 1216
1217 1217 /*
1218 1218 * Start the atomic add of the ire. Grab the ill lock,
1219 1219 * the bucket lock. Check for condemned.
1220 1220 */
1221 1221 error = ire_atomic_start(irb_ptr, ire);
1222 1222 if (error != 0) {
1223 1223 printf("no ire_atomic_start for %p\n", (void *)ire);
1224 1224 ire_delete(ire);
1225 1225 irb_refrele(irb_ptr);
1226 1226 return (NULL);
1227 1227 }
1228 1228 /*
1229 1229 * If we are creating a hidden IRE, make sure we search for
1230 1230 * hidden IREs when searching for duplicates below.
1231 1231 * Otherwise, we might find an IRE on some other interface
1232 1232 * that's not marked hidden.
1233 1233 */
1234 1234 if (ire->ire_testhidden)
1235 1235 match_flags |= MATCH_IRE_TESTHIDDEN;
1236 1236
1237 1237 /*
1238 1238 * Atomically check for duplicate and insert in the table.
1239 1239 */
1240 1240 for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
1241 1241 if (IRE_IS_CONDEMNED(ire1))
1242 1242 continue;
1243 1243 /*
1244 1244 * Here we need an exact match on zoneid, i.e.,
1245 1245 * ire_match_args doesn't fit.
1246 1246 */
1247 1247 if (ire1->ire_zoneid != ire->ire_zoneid)
1248 1248 continue;
1249 1249
1250 1250 if (ire1->ire_type != ire->ire_type)
1251 1251 continue;
1252 1252
1253 1253 /*
1254 1254 * Note: We do not allow multiple routes that differ only
1255 1255 * in the gateway security attributes; such routes are
1256 1256 * considered duplicates.
1257 1257 * To change that we explicitly have to treat them as
1258 1258 * different here.
1259 1259 */
1260 1260 if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask,
1261 1261 ire->ire_gateway_addr, ire->ire_type, ire->ire_ill,
1262 1262 ire->ire_zoneid, NULL, match_flags)) {
1263 1263 /*
1264 1264 * Return the old ire after doing a REFHOLD.
↓ open down ↓ |
863 lines elided |
↑ open up ↑ |
1265 1265 * As most of the callers continue to use the IRE
1266 1266 * after adding, we return a held ire. This will
1267 1267 * avoid a lookup in the caller again. If the callers
1268 1268 * don't want to use it, they need to do a REFRELE.
1269 1269 *
1270 1270 * We only allow exactly one IRE_IF_CLONE for any dst,
1271 1271 * so, if this is an IF_CLONE, return the ire without
1272 1272 * an identical_ref, but with an ire_ref held.
1273 1273 */
1274 1274 if (ire->ire_type != IRE_IF_CLONE) {
1275 - atomic_add_32(&ire1->ire_identical_ref, 1);
1275 + atomic_inc_32(&ire1->ire_identical_ref);
1276 1276 DTRACE_PROBE2(ire__add__exist, ire_t *, ire1,
1277 1277 ire_t *, ire);
1278 1278 }
1279 1279 ire_refhold(ire1);
1280 1280 ire_atomic_end(irb_ptr, ire);
1281 1281 ire_delete(ire);
1282 1282 irb_refrele(irb_ptr);
1283 1283 return (ire1);
1284 1284 }
1285 1285 }
1286 1286
1287 1287 /*
1288 1288 * Normally we do head insertion since most things do not care about
1289 1289 * the order of the IREs in the bucket. Note that ip_cgtp_bcast_add
1290 1290 * assumes we at least do head insertion so that its IRE_BROADCAST
1291 1291 * arrive ahead of existing IRE_HOST for the same address.
1292 1292 * However, due to shared-IP zones (and restrict_interzone_loopback)
1293 1293 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
1294 1294 * address. For that reason we do tail insertion for IRE_IF_CLONE.
1295 1295 * Due to the IRE_BROADCAST on cgtp0, which must be last in the bucket,
1296 1296 * we do tail insertion of IRE_BROADCASTs that do not have RTF_MULTIRT
1297 1297 * set.
1298 1298 */
1299 1299 irep = (ire_t **)irb_ptr;
1300 1300 if ((ire->ire_type & IRE_IF_CLONE) ||
1301 1301 ((ire->ire_type & IRE_BROADCAST) &&
1302 1302 !(ire->ire_flags & RTF_MULTIRT))) {
1303 1303 while ((ire1 = *irep) != NULL)
1304 1304 irep = &ire1->ire_next;
1305 1305 }
1306 1306 /* Insert at *irep */
1307 1307 ire1 = *irep;
1308 1308 if (ire1 != NULL)
1309 1309 ire1->ire_ptpn = &ire->ire_next;
1310 1310 ire->ire_next = ire1;
1311 1311 /* Link the new one in. */
1312 1312 ire->ire_ptpn = irep;
1313 1313
1314 1314 /*
1315 1315 * ire_walk routines de-reference ire_next without holding
1316 1316 * a lock. Before we point to the new ire, we want to make
1317 1317 * sure the store that sets the ire_next of the new ire
1318 1318 * reaches global visibility, so that ire_walk routines
1319 1319 * don't see a truncated list of ires i.e if the ire_next
1320 1320 * of the new ire gets set after we do "*irep = ire" due
1321 1321 * to re-ordering, the ire_walk thread will see a NULL
1322 1322 * once it accesses the ire_next of the new ire.
1323 1323 * membar_producer() makes sure that the following store
1324 1324 * happens *after* all of the above stores.
1325 1325 */
1326 1326 membar_producer();
1327 1327 *irep = ire;
1328 1328 ire->ire_bucket = irb_ptr;
1329 1329 /*
1330 1330 * We return a bumped up IRE above. Keep it symmetrical
1331 1331 * so that the callers will always have to release. This
1332 1332 * helps the callers of this function because they continue
1333 1333 * to use the IRE after adding and hence they don't have to
1334 1334 * lookup again after we return the IRE.
1335 1335 *
1336 1336 * NOTE : We don't have to use atomics as this is appearing
1337 1337 * in the list for the first time and no one else can bump
1338 1338 * up the reference count on this yet.
1339 1339 */
1340 1340 ire_refhold_locked(ire);
1341 1341 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted);
1342 1342
1343 1343 irb_ptr->irb_ire_cnt++;
1344 1344 if (irb_ptr->irb_marks & IRB_MARK_DYNAMIC)
1345 1345 irb_ptr->irb_nire++;
1346 1346
1347 1347 if (ire->ire_ill != NULL) {
1348 1348 ire->ire_ill->ill_ire_cnt++;
1349 1349 ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */
1350 1350 }
1351 1351
1352 1352 ire_atomic_end(irb_ptr, ire);
1353 1353
1354 1354 /* Make any caching of the IREs be notified or updated */
1355 1355 ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
1356 1356
1357 1357 if (ire->ire_ill != NULL)
1358 1358 ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock));
1359 1359 irb_refrele(irb_ptr);
1360 1360 return (ire);
1361 1361 }
1362 1362
1363 1363 /*
1364 1364 * irb_refrele is the only caller of the function. After ire_unlink, it
1365 1365 * calls this to do the final cleanup for each unlinked ire.
1366 1366 */
1367 1367 void
1368 1368 ire_cleanup(ire_t *ire)
1369 1369 {
1370 1370 ire_t *ire_next;
1371 1371 ip_stack_t *ipst = ire->ire_ipst;
1372 1372
1373 1373 ASSERT(ire != NULL);
1374 1374
1375 1375 while (ire != NULL) {
1376 1376 ire_next = ire->ire_next;
1377 1377 if (ire->ire_ipversion == IPV4_VERSION) {
1378 1378 ire_delete_v4(ire);
1379 1379 BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
1380 1380 ire_stats_deleted);
1381 1381 } else {
1382 1382 ASSERT(ire->ire_ipversion == IPV6_VERSION);
1383 1383 ire_delete_v6(ire);
1384 1384 BUMP_IRE_STATS(ipst->ips_ire_stats_v6,
1385 1385 ire_stats_deleted);
1386 1386 }
1387 1387 /*
1388 1388 * Now it's really out of the list. Before doing the
1389 1389 * REFRELE, set ire_next to NULL as ire_inactive asserts
1390 1390 * so.
1391 1391 */
1392 1392 ire->ire_next = NULL;
1393 1393 ire_refrele_notr(ire);
1394 1394 ire = ire_next;
1395 1395 }
1396 1396 }
1397 1397
1398 1398 /*
1399 1399 * irb_refrele is the only caller of the function. It calls this to unlink
1400 1400 * all the CONDEMNED ires from this bucket.
1401 1401 */
1402 1402 ire_t *
1403 1403 ire_unlink(irb_t *irb)
1404 1404 {
1405 1405 ire_t *ire;
1406 1406 ire_t *ire1;
1407 1407 ire_t **ptpn;
1408 1408 ire_t *ire_list = NULL;
1409 1409
1410 1410 ASSERT(RW_WRITE_HELD(&irb->irb_lock));
1411 1411 ASSERT(((irb->irb_marks & IRB_MARK_DYNAMIC) && irb->irb_refcnt == 1) ||
1412 1412 (irb->irb_refcnt == 0));
1413 1413 ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED);
1414 1414 ASSERT(irb->irb_ire != NULL);
1415 1415
1416 1416 for (ire = irb->irb_ire; ire != NULL; ire = ire1) {
1417 1417 ire1 = ire->ire_next;
1418 1418 if (IRE_IS_CONDEMNED(ire)) {
1419 1419 ptpn = ire->ire_ptpn;
1420 1420 ire1 = ire->ire_next;
1421 1421 if (ire1)
1422 1422 ire1->ire_ptpn = ptpn;
1423 1423 *ptpn = ire1;
1424 1424 ire->ire_ptpn = NULL;
1425 1425 ire->ire_next = NULL;
1426 1426
1427 1427 /*
1428 1428 * We need to call ire_delete_v4 or ire_delete_v6 to
1429 1429 * clean up dependents and the redirects pointing at
1430 1430 * the default gateway. We need to drop the lock
1431 1431 * as ire_flush_cache/ire_delete_host_redirects require
1432 1432 * so. But we can't drop the lock, as ire_unlink needs
1433 1433 * to atomically remove the ires from the list.
1434 1434 * So, create a temporary list of CONDEMNED ires
1435 1435 * for doing ire_delete_v4/ire_delete_v6 operations
1436 1436 * later on.
1437 1437 */
1438 1438 ire->ire_next = ire_list;
1439 1439 ire_list = ire;
1440 1440 }
1441 1441 }
1442 1442 irb->irb_marks &= ~IRB_MARK_CONDEMNED;
1443 1443 return (ire_list);
1444 1444 }
1445 1445
1446 1446 /*
1447 1447 * Clean up the radix node for this ire. Must be called by irb_refrele
1448 1448 * when there are no ire's left in the bucket. Returns TRUE if the bucket
1449 1449 * is deleted and freed.
1450 1450 */
1451 1451 boolean_t
1452 1452 irb_inactive(irb_t *irb)
1453 1453 {
1454 1454 struct rt_entry *rt;
1455 1455 struct radix_node *rn;
1456 1456 ip_stack_t *ipst = irb->irb_ipst;
1457 1457
1458 1458 ASSERT(irb->irb_ipst != NULL);
1459 1459
1460 1460 rt = IRB2RT(irb);
1461 1461 rn = (struct radix_node *)rt;
1462 1462
1463 1463 /* first remove it from the radix tree. */
1464 1464 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
1465 1465 rw_enter(&irb->irb_lock, RW_WRITER);
1466 1466 if (irb->irb_refcnt == 1 && irb->irb_nire == 0) {
1467 1467 rn = ipst->ips_ip_ftable->rnh_deladdr(rn->rn_key, rn->rn_mask,
1468 1468 ipst->ips_ip_ftable);
1469 1469 DTRACE_PROBE1(irb__free, rt_t *, rt);
1470 1470 ASSERT((void *)rn == (void *)rt);
1471 1471 Free(rt, rt_entry_cache);
1472 1472 /* irb_lock is freed */
1473 1473 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1474 1474 return (B_TRUE);
1475 1475 }
1476 1476 rw_exit(&irb->irb_lock);
1477 1477 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1478 1478 return (B_FALSE);
1479 1479 }
1480 1480
1481 1481 /*
1482 1482 * Delete the specified IRE.
1483 1483 * We assume that if ire_bucket is not set then ire_ill->ill_ire_cnt was
1484 1484 * not incremented i.e., that the insertion in the bucket and the increment
1485 1485 * of that counter is done atomically.
1486 1486 */
1487 1487 void
1488 1488 ire_delete(ire_t *ire)
1489 1489 {
1490 1490 ire_t *ire1;
1491 1491 ire_t **ptpn;
1492 1492 irb_t *irb;
1493 1493 ip_stack_t *ipst = ire->ire_ipst;
1494 1494
1495 1495 if ((irb = ire->ire_bucket) == NULL) {
1496 1496 /*
1497 1497 * It was never inserted in the list. Should call REFRELE
1498 1498 * to free this IRE.
1499 1499 */
1500 1500 ire_make_condemned(ire);
1501 1501 ire_refrele_notr(ire);
1502 1502 return;
1503 1503 }
1504 1504
1505 1505 /*
1506 1506 * Move the use counts from an IRE_IF_CLONE to its parent
1507 1507 * IRE_INTERFACE.
1508 1508 * We need to do this before acquiring irb_lock.
1509 1509 */
1510 1510 if (ire->ire_type & IRE_IF_CLONE) {
1511 1511 ire_t *parent;
1512 1512
1513 1513 rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
1514 1514 if ((parent = ire->ire_dep_parent) != NULL) {
1515 1515 parent->ire_ob_pkt_count += ire->ire_ob_pkt_count;
1516 1516 parent->ire_ib_pkt_count += ire->ire_ib_pkt_count;
1517 1517 ire->ire_ob_pkt_count = 0;
1518 1518 ire->ire_ib_pkt_count = 0;
1519 1519 }
1520 1520 rw_exit(&ipst->ips_ire_dep_lock);
1521 1521 }
1522 1522
1523 1523 rw_enter(&irb->irb_lock, RW_WRITER);
1524 1524 if (ire->ire_ptpn == NULL) {
1525 1525 /*
↓ open down ↓ |
240 lines elided |
↑ open up ↑ |
1526 1526 * Some other thread has removed us from the list.
1527 1527 * It should have done the REFRELE for us.
1528 1528 */
1529 1529 rw_exit(&irb->irb_lock);
1530 1530 return;
1531 1531 }
1532 1532
1533 1533 if (!IRE_IS_CONDEMNED(ire)) {
1534 1534 /* Is this an IRE representing multiple duplicate entries? */
1535 1535 ASSERT(ire->ire_identical_ref >= 1);
1536 - if (atomic_add_32_nv(&ire->ire_identical_ref, -1) != 0) {
1536 + if (atomic_dec_32_nv(&ire->ire_identical_ref) != 0) {
1537 1537 /* Removed one of the identical parties */
1538 1538 rw_exit(&irb->irb_lock);
1539 1539 return;
1540 1540 }
1541 1541
1542 1542 irb->irb_ire_cnt--;
1543 1543 ire_make_condemned(ire);
1544 1544 }
1545 1545
1546 1546 if (irb->irb_refcnt != 0) {
1547 1547 /*
1548 1548 * The last thread to leave this bucket will
1549 1549 * delete this ire.
1550 1550 */
1551 1551 irb->irb_marks |= IRB_MARK_CONDEMNED;
1552 1552 rw_exit(&irb->irb_lock);
1553 1553 return;
1554 1554 }
1555 1555
1556 1556 /*
1557 1557 * Normally to delete an ire, we walk the bucket. While we
1558 1558 * walk the bucket, we normally bump up irb_refcnt and hence
1559 1559 * we return from above where we mark CONDEMNED and the ire
1560 1560 * gets deleted from ire_unlink. This case is where somebody
1561 1561 * knows the ire e.g by doing a lookup, and wants to delete the
1562 1562 * IRE. irb_refcnt would be 0 in this case if nobody is walking
1563 1563 * the bucket.
1564 1564 */
1565 1565 ptpn = ire->ire_ptpn;
1566 1566 ire1 = ire->ire_next;
1567 1567 if (ire1 != NULL)
1568 1568 ire1->ire_ptpn = ptpn;
1569 1569 ASSERT(ptpn != NULL);
1570 1570 *ptpn = ire1;
1571 1571 ire->ire_ptpn = NULL;
1572 1572 ire->ire_next = NULL;
1573 1573 if (ire->ire_ipversion == IPV6_VERSION) {
1574 1574 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_deleted);
1575 1575 } else {
1576 1576 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted);
1577 1577 }
1578 1578 rw_exit(&irb->irb_lock);
1579 1579
1580 1580 /* Cleanup dependents and related stuff */
1581 1581 if (ire->ire_ipversion == IPV6_VERSION) {
1582 1582 ire_delete_v6(ire);
1583 1583 } else {
1584 1584 ire_delete_v4(ire);
1585 1585 }
1586 1586 /*
1587 1587 * We removed it from the list. Decrement the
1588 1588 * reference count.
1589 1589 */
1590 1590 ire_refrele_notr(ire);
1591 1591 }
1592 1592
1593 1593 /*
1594 1594 * Delete the specified IRE.
1595 1595 * All calls should use ire_delete().
1596 1596 * Sometimes called as writer though not required by this function.
1597 1597 *
1598 1598 * NOTE : This function is called only if the ire was added
1599 1599 * in the list.
1600 1600 */
1601 1601 static void
1602 1602 ire_delete_v4(ire_t *ire)
1603 1603 {
1604 1604 ip_stack_t *ipst = ire->ire_ipst;
1605 1605
1606 1606 ASSERT(ire->ire_refcnt >= 1);
1607 1607 ASSERT(ire->ire_ipversion == IPV4_VERSION);
1608 1608
1609 1609 ire_flush_cache_v4(ire, IRE_FLUSH_DELETE);
1610 1610 if (ire->ire_type == IRE_DEFAULT) {
1611 1611 /*
1612 1612 * when a default gateway is going away
1613 1613 * delete all the host redirects pointing at that
1614 1614 * gateway.
1615 1615 */
1616 1616 ire_delete_host_redirects(ire->ire_gateway_addr, ipst);
1617 1617 }
1618 1618
1619 1619 /*
1620 1620 * If we are deleting an IRE_INTERFACE then we make sure we also
1621 1621 * delete any IRE_IF_CLONE that has been created from it.
1622 1622 * Those are always in ire_dep_children.
1623 1623 */
1624 1624 if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != NULL)
1625 1625 ire_dep_delete_if_clone(ire);
1626 1626
1627 1627 /* Remove from parent dependencies and child */
1628 1628 rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
1629 1629 if (ire->ire_dep_parent != NULL)
1630 1630 ire_dep_remove(ire);
1631 1631
1632 1632 while (ire->ire_dep_children != NULL)
1633 1633 ire_dep_remove(ire->ire_dep_children);
1634 1634 rw_exit(&ipst->ips_ire_dep_lock);
1635 1635 }
1636 1636
1637 1637 /*
1638 1638 * ire_refrele is the only caller of the function. It calls this
1639 1639 * to free the ire when the reference count goes to zero.
1640 1640 */
1641 1641 void
1642 1642 ire_inactive(ire_t *ire)
1643 1643 {
1644 1644 ill_t *ill;
1645 1645 irb_t *irb;
1646 1646 ip_stack_t *ipst = ire->ire_ipst;
1647 1647
1648 1648 ASSERT(ire->ire_refcnt == 0);
1649 1649 ASSERT(ire->ire_ptpn == NULL);
1650 1650 ASSERT(ire->ire_next == NULL);
1651 1651
1652 1652 /* Count how many condemned ires for kmem_cache callback */
1653 1653 ASSERT(IRE_IS_CONDEMNED(ire));
1654 1654 atomic_add_32(&ipst->ips_num_ire_condemned, -1);
1655 1655
1656 1656 if (ire->ire_gw_secattr != NULL) {
1657 1657 ire_gw_secattr_free(ire->ire_gw_secattr);
1658 1658 ire->ire_gw_secattr = NULL;
1659 1659 }
1660 1660
1661 1661 /*
1662 1662 * ire_nce_cache is cleared in ire_delete, and we make sure we don't
1663 1663 * set it once the ire is marked condemned.
1664 1664 */
1665 1665 ASSERT(ire->ire_nce_cache == NULL);
1666 1666
1667 1667 /*
1668 1668 * Since any parent would have a refhold on us they would already
1669 1669 * have been removed.
1670 1670 */
1671 1671 ASSERT(ire->ire_dep_parent == NULL);
1672 1672 ASSERT(ire->ire_dep_sib_next == NULL);
1673 1673 ASSERT(ire->ire_dep_sib_ptpn == NULL);
1674 1674
1675 1675 /*
1676 1676 * Since any children would have a refhold on us they should have
1677 1677 * already been removed.
1678 1678 */
1679 1679 ASSERT(ire->ire_dep_children == NULL);
1680 1680
1681 1681 /*
1682 1682 * ill_ire_cnt is increased when the IRE is inserted in the
1683 1683 * bucket - not when the IRE is created.
1684 1684 */
1685 1685 irb = ire->ire_bucket;
1686 1686 ill = ire->ire_ill;
1687 1687 if (irb != NULL && ill != NULL) {
1688 1688 mutex_enter(&ill->ill_lock);
1689 1689 ASSERT(ill->ill_ire_cnt != 0);
1690 1690 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
1691 1691 (char *), "ire", (void *), ire);
1692 1692 ill->ill_ire_cnt--;
1693 1693 if (ILL_DOWN_OK(ill)) {
1694 1694 /* Drops the ill lock */
1695 1695 ipif_ill_refrele_tail(ill);
1696 1696 } else {
1697 1697 mutex_exit(&ill->ill_lock);
1698 1698 }
1699 1699 }
1700 1700 ire->ire_ill = NULL;
1701 1701
1702 1702 /* This should be true for both V4 and V6 */
1703 1703 if (irb != NULL && (irb->irb_marks & IRB_MARK_DYNAMIC)) {
1704 1704 rw_enter(&irb->irb_lock, RW_WRITER);
1705 1705 irb->irb_nire--;
1706 1706 /*
1707 1707 * Instead of examining the conditions for freeing
1708 1708 * the radix node here, we do it by calling
1709 1709 * irb_refrele which is a single point in the code
1710 1710 * that embeds that logic. Bump up the refcnt to
1711 1711 * be able to call irb_refrele
1712 1712 */
1713 1713 irb_refhold_locked(irb);
1714 1714 rw_exit(&irb->irb_lock);
1715 1715 irb_refrele(irb);
1716 1716 }
1717 1717
1718 1718 #ifdef DEBUG
1719 1719 ire_trace_cleanup(ire);
1720 1720 #endif
1721 1721 mutex_destroy(&ire->ire_lock);
1722 1722 if (ire->ire_ipversion == IPV6_VERSION) {
1723 1723 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_freed);
1724 1724 } else {
1725 1725 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
1726 1726 }
1727 1727 kmem_cache_free(ire_cache, ire);
1728 1728 }
1729 1729
1730 1730 /*
1731 1731 * ire_update_generation is the callback function provided by
1732 1732 * ire_get_bucket() to update the generation number of any
1733 1733 * matching shorter route when a new route is added.
1734 1734 *
1735 1735 * This function always returns a failure return (B_FALSE)
1736 1736 * to force the caller (rn_matchaddr_args)
1737 1737 * to back-track up the tree looking for shorter matches.
1738 1738 */
1739 1739 /* ARGSUSED */
1740 1740 static boolean_t
1741 1741 ire_update_generation(struct radix_node *rn, void *arg)
1742 1742 {
1743 1743 struct rt_entry *rt = (struct rt_entry *)rn;
1744 1744
1745 1745 /* We need to handle all in the same bucket */
1746 1746 irb_increment_generation(&rt->rt_irb);
1747 1747 return (B_FALSE);
1748 1748 }
1749 1749
1750 1750 /*
1751 1751 * Take care of all the generation numbers in the bucket.
1752 1752 */
1753 1753 void
1754 1754 irb_increment_generation(irb_t *irb)
1755 1755 {
1756 1756 ire_t *ire;
1757 1757 ip_stack_t *ipst;
1758 1758
1759 1759 if (irb == NULL || irb->irb_ire_cnt == 0)
1760 1760 return;
1761 1761
1762 1762 ipst = irb->irb_ipst;
1763 1763 /*
1764 1764 * we cannot do an irb_refhold/irb_refrele here as the caller
1765 1765 * already has the global RADIX_NODE_HEAD_WLOCK, and the irb_refrele
1766 1766 * may result in an attempt to free the irb_t, which also needs
1767 1767 * the RADIX_NODE_HEAD lock. However, since we want to traverse the
1768 1768 * irb_ire list without fear of having a condemned ire removed from
1769 1769 * the list, we acquire the irb_lock as WRITER. Moreover, since
1770 1770 * the ire_generation increments are done under the ire_dep_lock,
1771 1771 * acquire the locks in the prescribed lock order first.
1772 1772 */
1773 1773 rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
1774 1774 rw_enter(&irb->irb_lock, RW_WRITER);
1775 1775 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
1776 1776 if (!IRE_IS_CONDEMNED(ire))
1777 1777 ire_increment_generation(ire); /* Ourselves */
1778 1778 ire_dep_incr_generation_locked(ire); /* Dependants */
1779 1779 }
1780 1780 rw_exit(&irb->irb_lock);
1781 1781 rw_exit(&ipst->ips_ire_dep_lock);
1782 1782 }
1783 1783
1784 1784 /*
1785 1785 * When an IRE is added or deleted this routine is called to make sure
1786 1786 * any caching of IRE information is notified or updated.
1787 1787 *
1788 1788 * The flag argument indicates if the flush request is due to addition
1789 1789 * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
1790 1790 * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
1791 1791 */
1792 1792 void
1793 1793 ire_flush_cache_v4(ire_t *ire, int flag)
1794 1794 {
1795 1795 irb_t *irb = ire->ire_bucket;
1796 1796 struct rt_entry *rt = IRB2RT(irb);
1797 1797 ip_stack_t *ipst = ire->ire_ipst;
1798 1798
1799 1799 /*
1800 1800 * IRE_IF_CLONE ire's don't provide any new information
1801 1801 * than the parent from which they are cloned, so don't
1802 1802 * perturb the generation numbers.
1803 1803 */
1804 1804 if (ire->ire_type & IRE_IF_CLONE)
1805 1805 return;
1806 1806
1807 1807 /*
1808 1808 * Ensure that an ire_add during a lookup serializes the updates of the
1809 1809 * generation numbers under the radix head lock so that the lookup gets
1810 1810 * either the old ire and old generation number, or a new ire and new
1811 1811 * generation number.
1812 1812 */
1813 1813 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
1814 1814
1815 1815 /*
1816 1816 * If a route was just added, we need to notify everybody that
1817 1817 * has cached an IRE_NOROUTE since there might now be a better
1818 1818 * route for them.
1819 1819 */
1820 1820 if (flag == IRE_FLUSH_ADD) {
1821 1821 ire_increment_generation(ipst->ips_ire_reject_v4);
1822 1822 ire_increment_generation(ipst->ips_ire_blackhole_v4);
1823 1823 }
1824 1824
1825 1825 /* Adding a default can't otherwise provide a better route */
1826 1826 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) {
1827 1827 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1828 1828 return;
1829 1829 }
1830 1830
1831 1831 switch (flag) {
1832 1832 case IRE_FLUSH_DELETE:
1833 1833 case IRE_FLUSH_GWCHANGE:
1834 1834 /*
1835 1835 * Update ire_generation for all ire_dep_children chains
1836 1836 * starting with this IRE
1837 1837 */
1838 1838 ire_dep_incr_generation(ire);
1839 1839 break;
1840 1840 case IRE_FLUSH_ADD:
1841 1841 /*
1842 1842 * Update the generation numbers of all shorter matching routes.
1843 1843 * ire_update_generation takes care of the dependants by
1844 1844 * using ire_dep_incr_generation.
1845 1845 */
1846 1846 (void) ipst->ips_ip_ftable->rnh_matchaddr_args(&rt->rt_dst,
1847 1847 ipst->ips_ip_ftable, ire_update_generation, NULL);
1848 1848 break;
1849 1849 }
1850 1850 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1851 1851 }
1852 1852
1853 1853 /*
1854 1854 * Matches the arguments passed with the values in the ire.
1855 1855 *
1856 1856 * Note: for match types that match using "ill" passed in, ill
1857 1857 * must be checked for non-NULL before calling this routine.
1858 1858 */
1859 1859 boolean_t
1860 1860 ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
1861 1861 int type, const ill_t *ill, zoneid_t zoneid,
1862 1862 const ts_label_t *tsl, int match_flags)
1863 1863 {
1864 1864 ill_t *ire_ill = NULL, *dst_ill;
1865 1865 ip_stack_t *ipst = ire->ire_ipst;
1866 1866
1867 1867 ASSERT(ire->ire_ipversion == IPV4_VERSION);
1868 1868 ASSERT((ire->ire_addr & ~ire->ire_mask) == 0);
1869 1869 ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) ||
1870 1870 (ill != NULL && !ill->ill_isv6));
1871 1871
1872 1872 /*
1873 1873 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it is
1874 1874 * in fact hidden, to ensure the caller gets the right one.
1875 1875 */
1876 1876 if (ire->ire_testhidden) {
1877 1877 if (!(match_flags & MATCH_IRE_TESTHIDDEN))
1878 1878 return (B_FALSE);
1879 1879 }
1880 1880
1881 1881 if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
1882 1882 ire->ire_zoneid != ALL_ZONES) {
1883 1883 /*
1884 1884 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
1885 1885 * does not match that of ire_zoneid, a failure to
1886 1886 * match is reported at this point. Otherwise, since some IREs
1887 1887 * that are available in the global zone can be used in local
1888 1888 * zones, additional checks need to be performed:
1889 1889 *
1890 1890 * IRE_LOOPBACK
1891 1891 * entries should never be matched in this situation.
1892 1892 * Each zone has its own IRE_LOOPBACK.
1893 1893 *
1894 1894 * IRE_LOCAL
1895 1895 * We allow them for any zoneid. ire_route_recursive
1896 1896 * does additional checks when
1897 1897 * ip_restrict_interzone_loopback is set.
1898 1898 *
1899 1899 * If ill_usesrc_ifindex is set
1900 1900 * Then we check if the zone has a valid source address
1901 1901 * on the usesrc ill.
1902 1902 *
1903 1903 * If ire_ill is set, then check that the zone has an ipif
1904 1904 * on that ill.
1905 1905 *
1906 1906 * Outside of this function (in ire_round_robin) we check
1907 1907 * that any IRE_OFFLINK has a gateway that reachable from the
1908 1908 * zone when we have multiple choices (ECMP).
1909 1909 */
1910 1910 if (match_flags & MATCH_IRE_ZONEONLY)
1911 1911 return (B_FALSE);
1912 1912 if (ire->ire_type & IRE_LOOPBACK)
1913 1913 return (B_FALSE);
1914 1914
1915 1915 if (ire->ire_type & IRE_LOCAL)
1916 1916 goto matchit;
1917 1917
1918 1918 /*
1919 1919 * The normal case of IRE_ONLINK has a matching zoneid.
1920 1920 * Here we handle the case when shared-IP zones have been
1921 1921 * configured with IP addresses on vniN. In that case it
1922 1922 * is ok for traffic from a zone to use IRE_ONLINK routes
1923 1923 * if the ill has a usesrc pointing at vniN
1924 1924 */
1925 1925 dst_ill = ire->ire_ill;
1926 1926 if (ire->ire_type & IRE_ONLINK) {
1927 1927 uint_t ifindex;
1928 1928
1929 1929 /*
1930 1930 * Note there is no IRE_INTERFACE on vniN thus
1931 1931 * can't do an IRE lookup for a matching route.
1932 1932 */
1933 1933 ifindex = dst_ill->ill_usesrc_ifindex;
1934 1934 if (ifindex == 0)
1935 1935 return (B_FALSE);
1936 1936
1937 1937 /*
1938 1938 * If there is a usable source address in the
1939 1939 * zone, then it's ok to return this IRE_INTERFACE
1940 1940 */
1941 1941 if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
1942 1942 zoneid, ipst)) {
1943 1943 ip3dbg(("ire_match_args: no usrsrc for zone"
1944 1944 " dst_ill %p\n", (void *)dst_ill));
1945 1945 return (B_FALSE);
1946 1946 }
1947 1947 }
1948 1948 /*
1949 1949 * For example, with
1950 1950 * route add 11.0.0.0 gw1 -ifp bge0
1951 1951 * route add 11.0.0.0 gw2 -ifp bge1
1952 1952 * this code would differentiate based on
1953 1953 * where the sending zone has addresses.
1954 1954 * Only if the zone has an address on bge0 can it use the first
1955 1955 * route. It isn't clear if this behavior is documented
1956 1956 * anywhere.
1957 1957 */
1958 1958 if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
1959 1959 ipif_t *tipif;
1960 1960
1961 1961 mutex_enter(&dst_ill->ill_lock);
1962 1962 for (tipif = dst_ill->ill_ipif;
1963 1963 tipif != NULL; tipif = tipif->ipif_next) {
1964 1964 if (!IPIF_IS_CONDEMNED(tipif) &&
1965 1965 (tipif->ipif_flags & IPIF_UP) &&
1966 1966 (tipif->ipif_zoneid == zoneid ||
1967 1967 tipif->ipif_zoneid == ALL_ZONES))
1968 1968 break;
1969 1969 }
1970 1970 mutex_exit(&dst_ill->ill_lock);
1971 1971 if (tipif == NULL) {
1972 1972 return (B_FALSE);
1973 1973 }
1974 1974 }
1975 1975 }
1976 1976
1977 1977 matchit:
1978 1978 ire_ill = ire->ire_ill;
1979 1979 if (match_flags & MATCH_IRE_ILL) {
1980 1980
1981 1981 /*
1982 1982 * If asked to match an ill, we *must* match
1983 1983 * on the ire_ill for ipmp test addresses, or
1984 1984 * any of the ill in the group for data addresses.
1985 1985 * If we don't, we may as well fail.
1986 1986 * However, we need an exception for IRE_LOCALs to ensure
1987 1987 * we loopback packets even sent to test addresses on different
1988 1988 * interfaces in the group.
1989 1989 */
1990 1990 if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
1991 1991 !(ire->ire_type & IRE_LOCAL)) {
1992 1992 if (ire->ire_ill != ill)
1993 1993 return (B_FALSE);
1994 1994 } else {
1995 1995 match_flags &= ~MATCH_IRE_TESTHIDDEN;
1996 1996 /*
1997 1997 * We know that ill is not NULL, but ire_ill could be
1998 1998 * NULL
1999 1999 */
2000 2000 if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
2001 2001 return (B_FALSE);
2002 2002 }
2003 2003 }
2004 2004 if (match_flags & MATCH_IRE_SRC_ILL) {
2005 2005 if (ire_ill == NULL)
2006 2006 return (B_FALSE);
2007 2007 if (!IS_ON_SAME_LAN(ill, ire_ill)) {
2008 2008 if (ire_ill->ill_usesrc_ifindex == 0 ||
2009 2009 (ire_ill->ill_usesrc_ifindex !=
2010 2010 ill->ill_phyint->phyint_ifindex))
2011 2011 return (B_FALSE);
2012 2012 }
2013 2013 }
2014 2014
2015 2015 if ((ire->ire_addr == (addr & mask)) &&
2016 2016 ((!(match_flags & MATCH_IRE_GW)) ||
2017 2017 (ire->ire_gateway_addr == gateway)) &&
2018 2018 ((!(match_flags & MATCH_IRE_DIRECT)) ||
2019 2019 !(ire->ire_flags & RTF_INDIRECT)) &&
2020 2020 ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
2021 2021 ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
2022 2022 ((!(match_flags & MATCH_IRE_MASK)) || (ire->ire_mask == mask)) &&
2023 2023 ((!(match_flags & MATCH_IRE_SECATTR)) ||
2024 2024 (!is_system_labeled()) ||
2025 2025 (tsol_ire_match_gwattr(ire, tsl) == 0))) {
2026 2026 /* We found the matched IRE */
2027 2027 return (B_TRUE);
2028 2028 }
2029 2029 return (B_FALSE);
2030 2030 }
2031 2031
2032 2032 /*
2033 2033 * Check if the IRE_LOCAL uses the same ill as another route would use.
2034 2034 * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE,
2035 2035 * then we don't allow this IRE_LOCAL to be used.
2036 2036 * We always return an IRE; will be RTF_REJECT if no route available.
2037 2037 */
2038 2038 ire_t *
2039 2039 ire_alt_local(ire_t *ire, zoneid_t zoneid, const ts_label_t *tsl,
2040 2040 const ill_t *ill, uint_t *generationp)
2041 2041 {
2042 2042 ip_stack_t *ipst = ire->ire_ipst;
2043 2043 ire_t *alt_ire;
2044 2044 uint_t ire_type;
2045 2045 uint_t generation;
2046 2046 uint_t match_flags;
2047 2047
2048 2048 ASSERT(ire->ire_type & IRE_LOCAL);
2049 2049 ASSERT(ire->ire_ill != NULL);
2050 2050
2051 2051 /*
2052 2052 * Need to match on everything but local.
2053 2053 * This might result in the creation of a IRE_IF_CLONE for the
2054 2054 * same address as the IRE_LOCAL when restrict_interzone_loopback is
2055 2055 * set. ire_add_*() ensures that the IRE_IF_CLONE are tail inserted
2056 2056 * to make sure the IRE_LOCAL is always found first.
2057 2057 */
2058 2058 ire_type = (IRE_ONLINK | IRE_OFFLINK) & ~(IRE_LOCAL|IRE_LOOPBACK);
2059 2059 match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
2060 2060 if (ill != NULL)
2061 2061 match_flags |= MATCH_IRE_ILL;
2062 2062
2063 2063 if (ire->ire_ipversion == IPV4_VERSION) {
2064 2064 alt_ire = ire_route_recursive_v4(ire->ire_addr, ire_type,
2065 2065 ill, zoneid, tsl, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
2066 2066 NULL, &generation);
2067 2067 } else {
2068 2068 alt_ire = ire_route_recursive_v6(&ire->ire_addr_v6, ire_type,
2069 2069 ill, zoneid, tsl, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
2070 2070 NULL, &generation);
2071 2071 }
2072 2072 ASSERT(alt_ire != NULL);
2073 2073
2074 2074 if (alt_ire->ire_ill == ire->ire_ill) {
2075 2075 /* Going out the same ILL - ok to send to IRE_LOCAL */
2076 2076 ire_refrele(alt_ire);
2077 2077 } else {
2078 2078 /* Different ill - ignore IRE_LOCAL */
2079 2079 ire_refrele(ire);
2080 2080 ire = alt_ire;
2081 2081 if (generationp != NULL)
2082 2082 *generationp = generation;
2083 2083 }
2084 2084 return (ire);
2085 2085 }
2086 2086
2087 2087 boolean_t
2088 2088 ire_find_zoneid(struct radix_node *rn, void *arg)
2089 2089 {
2090 2090 struct rt_entry *rt = (struct rt_entry *)rn;
2091 2091 irb_t *irb;
2092 2092 ire_t *ire;
2093 2093 ire_ftable_args_t *margs = arg;
2094 2094
2095 2095 ASSERT(rt != NULL);
2096 2096
2097 2097 irb = &rt->rt_irb;
2098 2098
2099 2099 if (irb->irb_ire_cnt == 0)
2100 2100 return (B_FALSE);
2101 2101
2102 2102 rw_enter(&irb->irb_lock, RW_READER);
2103 2103 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
2104 2104 if (IRE_IS_CONDEMNED(ire))
2105 2105 continue;
2106 2106
2107 2107 if (!(ire->ire_type & IRE_INTERFACE))
2108 2108 continue;
2109 2109
2110 2110 if (ire->ire_zoneid != ALL_ZONES &&
2111 2111 ire->ire_zoneid != margs->ift_zoneid)
2112 2112 continue;
2113 2113
2114 2114 if (margs->ift_ill != NULL && margs->ift_ill != ire->ire_ill)
2115 2115 continue;
2116 2116
2117 2117 if (is_system_labeled() &&
2118 2118 tsol_ire_match_gwattr(ire, margs->ift_tsl) != 0)
2119 2119 continue;
2120 2120
2121 2121 rw_exit(&irb->irb_lock);
2122 2122 return (B_TRUE);
2123 2123 }
2124 2124 rw_exit(&irb->irb_lock);
2125 2125 return (B_FALSE);
2126 2126 }
2127 2127
2128 2128 /*
2129 2129 * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
2130 2130 * gateway address. If ill is non-NULL we also match on it.
2131 2131 * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
2132 2132 */
2133 2133 boolean_t
2134 2134 ire_gateway_ok_zone_v4(ipaddr_t gateway, zoneid_t zoneid, ill_t *ill,
2135 2135 const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
2136 2136 {
2137 2137 struct rt_sockaddr rdst;
2138 2138 struct rt_entry *rt;
2139 2139 ire_ftable_args_t margs;
2140 2140
2141 2141 ASSERT(ill == NULL || !ill->ill_isv6);
2142 2142 if (lock_held)
2143 2143 ASSERT(RW_READ_HELD(&ipst->ips_ip_ftable->rnh_lock));
2144 2144 else
2145 2145 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
2146 2146
2147 2147 bzero(&rdst, sizeof (rdst));
2148 2148 rdst.rt_sin_len = sizeof (rdst);
2149 2149 rdst.rt_sin_family = AF_INET;
2150 2150 rdst.rt_sin_addr.s_addr = gateway;
2151 2151
2152 2152 /*
2153 2153 * We only use margs for ill, zoneid, and tsl matching in
2154 2154 * ire_find_zoneid
2155 2155 */
2156 2156 bzero(&margs, sizeof (margs));
2157 2157 margs.ift_ill = ill;
2158 2158 margs.ift_zoneid = zoneid;
2159 2159 margs.ift_tsl = tsl;
2160 2160 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
2161 2161 ipst->ips_ip_ftable, ire_find_zoneid, (void *)&margs);
2162 2162
2163 2163 if (!lock_held)
2164 2164 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
2165 2165
2166 2166 return (rt != NULL);
2167 2167 }
2168 2168
2169 2169 /*
2170 2170 * ire_walk routine to delete a fraction of redirect IREs and IRE_CLONE_IF IREs.
2171 2171 * The fraction argument tells us what fraction of the IREs to delete.
2172 2172 * Common for IPv4 and IPv6.
2173 2173 * Used when memory backpressure.
2174 2174 */
2175 2175 static void
2176 2176 ire_delete_reclaim(ire_t *ire, char *arg)
2177 2177 {
2178 2178 ip_stack_t *ipst = ire->ire_ipst;
2179 2179 uint_t fraction = *(uint_t *)arg;
2180 2180 uint_t rand;
2181 2181
2182 2182 if ((ire->ire_flags & RTF_DYNAMIC) ||
2183 2183 (ire->ire_type & IRE_IF_CLONE)) {
2184 2184
2185 2185 /* Pick a random number */
2186 2186 rand = (uint_t)ddi_get_lbolt() +
2187 2187 IRE_ADDR_HASH_V6(ire->ire_addr_v6, 256);
2188 2188
2189 2189 /* Use truncation */
2190 2190 if ((rand/fraction)*fraction == rand) {
2191 2191 IP_STAT(ipst, ip_ire_reclaim_deleted);
2192 2192 ire_delete(ire);
2193 2193 }
2194 2194 }
2195 2195
2196 2196 }
2197 2197
2198 2198 /*
2199 2199 * kmem_cache callback to free up memory.
2200 2200 *
2201 2201 * Free a fraction (ips_ip_ire_reclaim_fraction) of things IP added dynamically
2202 2202 * (RTF_DYNAMIC and IRE_IF_CLONE).
2203 2203 */
2204 2204 static void
2205 2205 ip_ire_reclaim_stack(ip_stack_t *ipst)
2206 2206 {
2207 2207 uint_t fraction = ipst->ips_ip_ire_reclaim_fraction;
2208 2208
2209 2209 IP_STAT(ipst, ip_ire_reclaim_calls);
2210 2210
2211 2211 ire_walk(ire_delete_reclaim, &fraction, ipst);
2212 2212
2213 2213 /*
2214 2214 * Walk all CONNs that can have a reference on an ire, nce or dce.
2215 2215 * Get them to update any stale references to drop any refholds they
2216 2216 * have.
2217 2217 */
2218 2218 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
2219 2219 }
2220 2220
2221 2221 /*
2222 2222 * Called by the memory allocator subsystem directly, when the system
2223 2223 * is running low on memory.
2224 2224 */
2225 2225 /* ARGSUSED */
2226 2226 void
2227 2227 ip_ire_reclaim(void *args)
2228 2228 {
2229 2229 netstack_handle_t nh;
2230 2230 netstack_t *ns;
2231 2231 ip_stack_t *ipst;
2232 2232
2233 2233 netstack_next_init(&nh);
2234 2234 while ((ns = netstack_next(&nh)) != NULL) {
2235 2235 /*
2236 2236 * netstack_next() can return a netstack_t with a NULL
2237 2237 * netstack_ip at boot time.
2238 2238 */
2239 2239 if ((ipst = ns->netstack_ip) == NULL) {
2240 2240 netstack_rele(ns);
2241 2241 continue;
2242 2242 }
2243 2243 ip_ire_reclaim_stack(ipst);
2244 2244 netstack_rele(ns);
2245 2245 }
2246 2246 netstack_next_fini(&nh);
2247 2247 }
2248 2248
/*
 * Round *value up, in place, to a power of two in the range 2^1 .. 2^31.
 *
 * Values of 0, 1 and 2 all become 2. Values above 2^31 cannot be rounded up
 * in 32 bits and are clamped to 2^31.
 *
 * The shifts are done on an unsigned 32-bit one: shifting a signed int 1
 * left by 31 is undefined behaviour, and the clamp case needs exactly that
 * shift.
 */
static void
power2_roundup(uint32_t *value)
{
	unsigned int shift;

	for (shift = 1; shift < 31; shift++) {
		if (*value <= ((uint32_t)1 << shift))
			break;
	}
	*value = (uint32_t)1 << shift;
}
2260 2260
2261 2261 /* Global init for all zones */
2262 2262 void
2263 2263 ip_ire_g_init()
2264 2264 {
2265 2265 /*
2266 2266 * Create kmem_caches. ip_ire_reclaim() and ip_nce_reclaim()
2267 2267 * will give disposable IREs back to system when needed.
2268 2268 * This needs to be done here before anything else, since
2269 2269 * ire_add() expects the cache to be created.
2270 2270 */
2271 2271 ire_cache = kmem_cache_create("ire_cache",
2272 2272 sizeof (ire_t), 0, NULL, NULL,
2273 2273 ip_ire_reclaim, NULL, NULL, 0);
2274 2274
2275 2275 ncec_cache = kmem_cache_create("ncec_cache",
2276 2276 sizeof (ncec_t), 0, NULL, NULL,
2277 2277 ip_nce_reclaim, NULL, NULL, 0);
2278 2278 nce_cache = kmem_cache_create("nce_cache",
2279 2279 sizeof (nce_t), 0, NULL, NULL,
2280 2280 NULL, NULL, NULL, 0);
2281 2281
2282 2282 rt_entry_cache = kmem_cache_create("rt_entry",
2283 2283 sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0);
2284 2284
2285 2285 /*
2286 2286 * Have radix code setup kmem caches etc.
2287 2287 */
2288 2288 rn_init();
2289 2289 }
2290 2290
2291 2291 void
2292 2292 ip_ire_init(ip_stack_t *ipst)
2293 2293 {
2294 2294 ire_t *ire;
2295 2295 int error;
2296 2296
2297 2297 mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0);
2298 2298
2299 2299 (void) rn_inithead((void **)&ipst->ips_ip_ftable, 32);
2300 2300
2301 2301 /*
2302 2302 * Make sure that the forwarding table size is a power of 2.
2303 2303 * The IRE*_ADDR_HASH() macroes depend on that.
2304 2304 */
2305 2305 ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size;
2306 2306 power2_roundup(&ipst->ips_ip6_ftable_hash_size);
2307 2307
2308 2308 /*
2309 2309 * Allocate/initialize a pair of IRE_NOROUTEs for each of IPv4 and IPv6.
2310 2310 * The ire_reject_v* has RTF_REJECT set, and the ire_blackhole_v* has
2311 2311 * RTF_BLACKHOLE set. We use the latter for transient errors such
2312 2312 * as memory allocation failures and tripping on IRE_IS_CONDEMNED
2313 2313 * entries.
2314 2314 */
2315 2315 ire = kmem_cache_alloc(ire_cache, KM_SLEEP);
2316 2316 *ire = ire_null;
2317 2317 error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES,
2318 2318 RTF_REJECT|RTF_UP, NULL, ipst);
2319 2319 ASSERT(error == 0);
2320 2320 ipst->ips_ire_reject_v4 = ire;
2321 2321
2322 2322 ire = kmem_cache_alloc(ire_cache, KM_SLEEP);
2323 2323 *ire = ire_null;
2324 2324 error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES,
2325 2325 RTF_REJECT|RTF_UP, NULL, ipst);
2326 2326 ASSERT(error == 0);
2327 2327 ipst->ips_ire_reject_v6 = ire;
2328 2328
2329 2329 ire = kmem_cache_alloc(ire_cache, KM_SLEEP);
2330 2330 *ire = ire_null;
2331 2331 error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES,
2332 2332 RTF_BLACKHOLE|RTF_UP, NULL, ipst);
2333 2333 ASSERT(error == 0);
2334 2334 ipst->ips_ire_blackhole_v4 = ire;
2335 2335
2336 2336 ire = kmem_cache_alloc(ire_cache, KM_SLEEP);
2337 2337 *ire = ire_null;
2338 2338 error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES,
2339 2339 RTF_BLACKHOLE|RTF_UP, NULL, ipst);
2340 2340 ASSERT(error == 0);
2341 2341 ipst->ips_ire_blackhole_v6 = ire;
2342 2342
2343 2343 rw_init(&ipst->ips_ip6_ire_head_lock, NULL, RW_DEFAULT, NULL);
2344 2344 rw_init(&ipst->ips_ire_dep_lock, NULL, RW_DEFAULT, NULL);
2345 2345 }
2346 2346
2347 2347 void
2348 2348 ip_ire_g_fini(void)
2349 2349 {
2350 2350 kmem_cache_destroy(ire_cache);
2351 2351 kmem_cache_destroy(ncec_cache);
2352 2352 kmem_cache_destroy(nce_cache);
2353 2353 kmem_cache_destroy(rt_entry_cache);
2354 2354
2355 2355 rn_fini();
2356 2356 }
2357 2357
2358 2358 void
2359 2359 ip_ire_fini(ip_stack_t *ipst)
2360 2360 {
2361 2361 int i;
2362 2362
2363 2363 ire_make_condemned(ipst->ips_ire_reject_v6);
2364 2364 ire_refrele_notr(ipst->ips_ire_reject_v6);
2365 2365 ipst->ips_ire_reject_v6 = NULL;
2366 2366
2367 2367 ire_make_condemned(ipst->ips_ire_reject_v4);
2368 2368 ire_refrele_notr(ipst->ips_ire_reject_v4);
2369 2369 ipst->ips_ire_reject_v4 = NULL;
2370 2370
2371 2371 ire_make_condemned(ipst->ips_ire_blackhole_v6);
2372 2372 ire_refrele_notr(ipst->ips_ire_blackhole_v6);
2373 2373 ipst->ips_ire_blackhole_v6 = NULL;
2374 2374
2375 2375 ire_make_condemned(ipst->ips_ire_blackhole_v4);
2376 2376 ire_refrele_notr(ipst->ips_ire_blackhole_v4);
2377 2377 ipst->ips_ire_blackhole_v4 = NULL;
2378 2378
2379 2379 /*
2380 2380 * Delete all IREs - assumes that the ill/ipifs have
2381 2381 * been removed so what remains are just the ftable to handle.
2382 2382 */
2383 2383 ire_walk(ire_delete, NULL, ipst);
2384 2384
2385 2385 rn_freehead(ipst->ips_ip_ftable);
2386 2386 ipst->ips_ip_ftable = NULL;
2387 2387
2388 2388 rw_destroy(&ipst->ips_ire_dep_lock);
2389 2389 rw_destroy(&ipst->ips_ip6_ire_head_lock);
2390 2390
2391 2391 mutex_destroy(&ipst->ips_ire_ft_init_lock);
2392 2392
2393 2393 for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) {
2394 2394 irb_t *ptr;
2395 2395 int j;
2396 2396
2397 2397 if ((ptr = ipst->ips_ip_forwarding_table_v6[i]) == NULL)
2398 2398 continue;
2399 2399
2400 2400 for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) {
2401 2401 ASSERT(ptr[j].irb_ire == NULL);
2402 2402 rw_destroy(&ptr[j].irb_lock);
2403 2403 }
2404 2404 mi_free(ptr);
2405 2405 ipst->ips_ip_forwarding_table_v6[i] = NULL;
2406 2406 }
2407 2407 }
2408 2408
2409 2409 #ifdef DEBUG
2410 2410 void
2411 2411 ire_trace_ref(ire_t *ire)
2412 2412 {
2413 2413 mutex_enter(&ire->ire_lock);
2414 2414 if (ire->ire_trace_disable) {
2415 2415 mutex_exit(&ire->ire_lock);
2416 2416 return;
2417 2417 }
2418 2418
2419 2419 if (th_trace_ref(ire, ire->ire_ipst)) {
2420 2420 mutex_exit(&ire->ire_lock);
2421 2421 } else {
2422 2422 ire->ire_trace_disable = B_TRUE;
2423 2423 mutex_exit(&ire->ire_lock);
2424 2424 ire_trace_cleanup(ire);
2425 2425 }
2426 2426 }
2427 2427
2428 2428 void
2429 2429 ire_untrace_ref(ire_t *ire)
2430 2430 {
2431 2431 mutex_enter(&ire->ire_lock);
2432 2432 if (!ire->ire_trace_disable)
2433 2433 th_trace_unref(ire);
2434 2434 mutex_exit(&ire->ire_lock);
2435 2435 }
2436 2436
2437 2437 static void
2438 2438 ire_trace_cleanup(const ire_t *ire)
2439 2439 {
2440 2440 th_trace_cleanup(ire, ire->ire_trace_disable);
2441 2441 }
2442 2442 #endif /* DEBUG */
2443 2443
2444 2444 /*
2445 2445 * Find, or create if needed, the nce_t pointer to the neighbor cache
2446 2446 * entry ncec_t for an IPv4 address. The nce_t will be created on the ill_t
2447 2447 * in the non-IPMP case, or on the cast-ill in the IPMP bcast/mcast case, or
2448 2448 * on the next available under-ill (selected by the IPMP rotor) in the
2449 2449 * unicast IPMP case.
2450 2450 *
2451 2451 * If a neighbor-cache entry has to be created (i.e., one does not already
2452 2452 * exist in the nce list) the ncec_lladdr and ncec_state of the neighbor cache
2453 2453 * entry are initialized in nce_add_v4(). The broadcast, multicast, and
2454 2454 * link-layer type determine the contents of {ncec_state, ncec_lladdr} of
2455 2455 * the ncec_t created. The ncec_lladdr is non-null for all link types with
2456 2456 * non-zero ill_phys_addr_length, though the contents may be zero in cases
2457 2457 * where the link-layer type is not known at the time of creation
2458 2458 * (e.g., IRE_IFRESOLVER links)
2459 2459 *
2460 2460 * All IRE_BROADCAST entries have ncec_state = ND_REACHABLE, and the nce_lladr
2461 2461 * has the physical broadcast address of the outgoing interface.
2462 2462 * For unicast ire entries,
2463 2463 * - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created
2464 2464 * ncec_t with 0 nce_lladr contents, and will be in the ND_INITIAL state.
2465 2465 * - if the outgoing interface is a IRE_IF_NORESOLVER interface, no link
2466 2466 * layer resolution is necessary, so that the ncec_t will be in the
2467 2467 * ND_REACHABLE state
2468 2468 *
2469 2469 * The link layer information needed for broadcast addresses, and for
2470 2470 * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that
2471 2471 * never needs re-verification for the lifetime of the ncec_t. These are
2472 2472 * therefore marked NCE_F_NONUD.
2473 2473 *
2474 2474 * The nce returned will be created such that the nce_ill == ill that
2475 2475 * is passed in. Note that the nce itself may not have ncec_ill == ill
2476 2476 * where IPMP links are involved.
2477 2477 */
2478 2478 static nce_t *
2479 2479 ire_nce_init(ill_t *ill, const void *addr, int ire_type)
2480 2480 {
2481 2481 int err;
2482 2482 nce_t *nce = NULL;
2483 2483 uint16_t ncec_flags;
2484 2484 uchar_t *hwaddr;
2485 2485 boolean_t need_refrele = B_FALSE;
2486 2486 ill_t *in_ill = ill;
2487 2487 boolean_t is_unicast;
2488 2488 uint_t hwaddr_len;
2489 2489
2490 2490 is_unicast = ((ire_type & (IRE_MULTICAST|IRE_BROADCAST)) == 0);
2491 2491 if (IS_IPMP(ill) ||
2492 2492 ((ire_type & IRE_BROADCAST) && IS_UNDER_IPMP(ill))) {
2493 2493 if ((ill = ipmp_ill_hold_xmit_ill(ill, is_unicast)) == NULL)
2494 2494 return (NULL);
2495 2495 need_refrele = B_TRUE;
2496 2496 }
2497 2497 ncec_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0;
2498 2498
2499 2499 switch (ire_type) {
2500 2500 case IRE_BROADCAST:
2501 2501 ASSERT(!ill->ill_isv6);
2502 2502 ncec_flags |= (NCE_F_BCAST|NCE_F_NONUD);
2503 2503 break;
2504 2504 case IRE_MULTICAST:
2505 2505 ncec_flags |= (NCE_F_MCAST|NCE_F_NONUD);
2506 2506 break;
2507 2507 }
2508 2508
2509 2509 if (ill->ill_net_type == IRE_IF_NORESOLVER && is_unicast) {
2510 2510 hwaddr = ill->ill_dest_addr;
2511 2511 } else {
2512 2512 hwaddr = NULL;
2513 2513 }
2514 2514 hwaddr_len = ill->ill_phys_addr_length;
2515 2515
2516 2516 retry:
2517 2517 /* nce_state will be computed by nce_add_common() */
2518 2518 if (!ill->ill_isv6) {
2519 2519 err = nce_lookup_then_add_v4(ill, hwaddr, hwaddr_len, addr,
2520 2520 ncec_flags, ND_UNCHANGED, &nce);
2521 2521 } else {
2522 2522 err = nce_lookup_then_add_v6(ill, hwaddr, hwaddr_len, addr,
2523 2523 ncec_flags, ND_UNCHANGED, &nce);
2524 2524 }
2525 2525
2526 2526 switch (err) {
2527 2527 case 0:
2528 2528 break;
2529 2529 case EEXIST:
2530 2530 /*
2531 2531 * When subnets change or partially overlap what was once
2532 2532 * a broadcast address could now be a unicast, or vice versa.
2533 2533 */
2534 2534 if (((ncec_flags ^ nce->nce_common->ncec_flags) &
2535 2535 NCE_F_BCAST) != 0) {
2536 2536 ASSERT(!ill->ill_isv6);
2537 2537 ncec_delete(nce->nce_common);
2538 2538 nce_refrele(nce);
2539 2539 goto retry;
2540 2540 }
2541 2541 break;
2542 2542 default:
2543 2543 DTRACE_PROBE2(nce__init__fail, ill_t *, ill, int, err);
2544 2544 if (need_refrele)
2545 2545 ill_refrele(ill);
2546 2546 return (NULL);
2547 2547 }
2548 2548 /*
2549 2549 * If the ill was an under-ill of an IPMP group, we need to verify
2550 2550 * that it is still active so that we select an active interface in
2551 2551 * the group. However, since ipmp_ill_is_active ASSERTs for
2552 2552 * IS_UNDER_IPMP(), we first need to verify that the ill is an
2553 2553 * under-ill, and since this is being done in the data path, the
2554 2554 * only way to ascertain this is by holding the ill_g_lock.
2555 2555 */
2556 2556 rw_enter(&ill->ill_ipst->ips_ill_g_lock, RW_READER);
2557 2557 mutex_enter(&ill->ill_lock);
2558 2558 mutex_enter(&ill->ill_phyint->phyint_lock);
2559 2559 if (need_refrele && IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) {
2560 2560 /*
2561 2561 * need_refrele implies that the under ill was selected by
2562 2562 * ipmp_ill_hold_xmit_ill() because either the in_ill was an
2563 2563 * ipmp_ill, or we are sending a non-unicast packet on an
2564 2564 * under_ill. However, when we get here, the ill selected by
2565 2565 * ipmp_ill_hold_xmit_ill was pulled out of the active set
2566 2566 * (for unicast) or cast_ill nomination (for !unicast) after
2567 2567 * it was picked as the outgoing ill. We have to pick an
2568 2568 * active interface and/or cast_ill in the group.
2569 2569 */
2570 2570 mutex_exit(&ill->ill_phyint->phyint_lock);
2571 2571 nce_delete(nce);
2572 2572 mutex_exit(&ill->ill_lock);
2573 2573 rw_exit(&ill->ill_ipst->ips_ill_g_lock);
2574 2574 nce_refrele(nce);
2575 2575 ill_refrele(ill);
2576 2576 if ((ill = ipmp_ill_hold_xmit_ill(in_ill, is_unicast)) == NULL)
2577 2577 return (NULL);
2578 2578 goto retry;
2579 2579 } else {
2580 2580 mutex_exit(&ill->ill_phyint->phyint_lock);
2581 2581 mutex_exit(&ill->ill_lock);
2582 2582 rw_exit(&ill->ill_ipst->ips_ill_g_lock);
2583 2583 }
2584 2584 done:
2585 2585 ASSERT(nce->nce_ill == ill);
2586 2586 if (need_refrele)
2587 2587 ill_refrele(ill);
2588 2588 return (nce);
2589 2589 }
2590 2590
2591 2591 nce_t *
2592 2592 arp_nce_init(ill_t *ill, in_addr_t addr4, int ire_type)
2593 2593 {
2594 2594 return (ire_nce_init(ill, &addr4, ire_type));
2595 2595 }
2596 2596
2597 2597 nce_t *
2598 2598 ndp_nce_init(ill_t *ill, const in6_addr_t *addr6, int ire_type)
2599 2599 {
2600 2600 ASSERT((ire_type & IRE_BROADCAST) == 0);
2601 2601 return (ire_nce_init(ill, addr6, ire_type));
2602 2602 }
2603 2603
2604 2604 /*
2605 2605 * The caller should hold irb_lock as a writer if the ire is in a bucket.
2606 2606 * This routine will clear ire_nce_cache, and we make sure that we can never
2607 2607 * set ire_nce_cache after the ire is marked condemned.
2608 2608 */
2609 2609 void
2610 2610 ire_make_condemned(ire_t *ire)
↓ open down ↓ |
1064 lines elided |
↑ open up ↑ |
2611 2611 {
2612 2612 ip_stack_t *ipst = ire->ire_ipst;
2613 2613 nce_t *nce;
2614 2614
2615 2615 mutex_enter(&ire->ire_lock);
2616 2616 ASSERT(ire->ire_bucket == NULL ||
2617 2617 RW_WRITE_HELD(&ire->ire_bucket->irb_lock));
2618 2618 ASSERT(!IRE_IS_CONDEMNED(ire));
2619 2619 ire->ire_generation = IRE_GENERATION_CONDEMNED;
2620 2620 /* Count how many condemned ires for kmem_cache callback */
2621 - atomic_add_32(&ipst->ips_num_ire_condemned, 1);
2621 + atomic_inc_32(&ipst->ips_num_ire_condemned);
2622 2622 nce = ire->ire_nce_cache;
2623 2623 ire->ire_nce_cache = NULL;
2624 2624 mutex_exit(&ire->ire_lock);
2625 2625 if (nce != NULL)
2626 2626 nce_refrele(nce);
2627 2627 }
2628 2628
2629 2629 /*
2630 2630 * Increment the generation avoiding the special condemned value
2631 2631 */
2632 2632 void
2633 2633 ire_increment_generation(ire_t *ire)
2634 2634 {
2635 2635 uint_t generation;
2636 2636
2637 2637 mutex_enter(&ire->ire_lock);
2638 2638 /*
2639 2639 * Even though the caller has a hold it can't prevent a concurrent
2640 2640 * ire_delete marking the IRE condemned
2641 2641 */
2642 2642 if (!IRE_IS_CONDEMNED(ire)) {
2643 2643 generation = ire->ire_generation + 1;
2644 2644 if (generation == IRE_GENERATION_CONDEMNED)
2645 2645 generation = IRE_GENERATION_INITIAL;
2646 2646 ASSERT(generation != IRE_GENERATION_VERIFY);
2647 2647 ire->ire_generation = generation;
2648 2648 }
2649 2649 mutex_exit(&ire->ire_lock);
2650 2650 }
2651 2651
2652 2652 /*
2653 2653 * Increment ire_generation on all the IRE_MULTICASTs
2654 2654 * Used when the default multicast interface (as determined by
2655 2655 * ill_lookup_multicast) might have changed.
2656 2656 *
2657 2657 * That includes the zoneid, IFF_ flags, the IPv6 scope of the address, and
2658 2658 * ill unplumb.
2659 2659 */
2660 2660 void
2661 2661 ire_increment_multicast_generation(ip_stack_t *ipst, boolean_t isv6)
2662 2662 {
2663 2663 ill_t *ill;
2664 2664 ill_walk_context_t ctx;
2665 2665
2666 2666 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2667 2667 if (isv6)
2668 2668 ill = ILL_START_WALK_V6(&ctx, ipst);
2669 2669 else
2670 2670 ill = ILL_START_WALK_V4(&ctx, ipst);
2671 2671 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
2672 2672 if (ILL_IS_CONDEMNED(ill))
2673 2673 continue;
2674 2674 if (ill->ill_ire_multicast != NULL)
2675 2675 ire_increment_generation(ill->ill_ire_multicast);
2676 2676 }
2677 2677 rw_exit(&ipst->ips_ill_g_lock);
2678 2678 }
2679 2679
2680 2680 /*
2681 2681 * Return a held IRE_NOROUTE with RTF_REJECT set
2682 2682 */
2683 2683 ire_t *
2684 2684 ire_reject(ip_stack_t *ipst, boolean_t isv6)
2685 2685 {
2686 2686 ire_t *ire;
2687 2687
2688 2688 if (isv6)
2689 2689 ire = ipst->ips_ire_reject_v6;
2690 2690 else
2691 2691 ire = ipst->ips_ire_reject_v4;
2692 2692
2693 2693 ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED);
2694 2694 ire_refhold(ire);
2695 2695 return (ire);
2696 2696 }
2697 2697
2698 2698 /*
2699 2699 * Return a held IRE_NOROUTE with RTF_BLACKHOLE set
2700 2700 */
2701 2701 ire_t *
2702 2702 ire_blackhole(ip_stack_t *ipst, boolean_t isv6)
2703 2703 {
2704 2704 ire_t *ire;
2705 2705
2706 2706 if (isv6)
2707 2707 ire = ipst->ips_ire_blackhole_v6;
2708 2708 else
2709 2709 ire = ipst->ips_ire_blackhole_v4;
2710 2710
2711 2711 ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED);
2712 2712 ire_refhold(ire);
2713 2713 return (ire);
2714 2714 }
2715 2715
2716 2716 /*
2717 2717 * Return a held IRE_MULTICAST.
2718 2718 */
2719 2719 ire_t *
2720 2720 ire_multicast(ill_t *ill)
2721 2721 {
2722 2722 ire_t *ire = ill->ill_ire_multicast;
2723 2723
2724 2724 ASSERT(ire == NULL || ire->ire_generation != IRE_GENERATION_CONDEMNED);
2725 2725 if (ire == NULL)
2726 2726 ire = ire_blackhole(ill->ill_ipst, ill->ill_isv6);
2727 2727 else
2728 2728 ire_refhold(ire);
2729 2729 return (ire);
2730 2730 }
2731 2731
2732 2732 /*
2733 2733 * Given an IRE return its nexthop IRE. The nexthop IRE is an IRE_ONLINK
2734 2734 * that is an exact match (i.e., a /32 for IPv4 and /128 for IPv6).
2735 2735 * This can return an RTF_REJECT|RTF_BLACKHOLE.
2736 2736 * The returned IRE is held.
2737 2737 * The assumption is that ip_select_route() has been called and returned the
2738 2738 * IRE (thus ip_select_route would have set up the ire_dep* information.)
2739 2739 * If some IRE is deleted then ire_dep_remove() will have been called and
2740 2740 * we might not find a nexthop IRE, in which case we return NULL.
2741 2741 */
2742 2742 ire_t *
2743 2743 ire_nexthop(ire_t *ire)
2744 2744 {
2745 2745 ip_stack_t *ipst = ire->ire_ipst;
2746 2746
2747 2747 /* Acquire lock to walk ire_dep_parent */
2748 2748 rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
2749 2749 while (ire != NULL) {
2750 2750 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2751 2751 goto done;
2752 2752 }
2753 2753 /*
2754 2754 * If we find an IRE_ONLINK we are done. This includes
2755 2755 * the case of IRE_MULTICAST.
2756 2756 * Note that in order to send packets we need a host-specific
2757 2757 * IRE_IF_ALL first in the ire_dep_parent chain. Normally this
2758 2758 * is done by inserting an IRE_IF_CLONE if the IRE_INTERFACE
2759 2759 * was not host specific.
2760 2760 * However, ip_rts_request doesn't want to send packets
2761 2761 * hence doesn't want to allocate an IRE_IF_CLONE. Yet
2762 2762 * it needs an IRE_IF_ALL to get to the ill. Thus
2763 2763 * we return IRE_IF_ALL that are not host specific here.
2764 2764 */
2765 2765 if (ire->ire_type & IRE_ONLINK)
2766 2766 goto done;
2767 2767 ire = ire->ire_dep_parent;
2768 2768 }
2769 2769 rw_exit(&ipst->ips_ire_dep_lock);
2770 2770 return (NULL);
2771 2771
2772 2772 done:
2773 2773 ire_refhold(ire);
2774 2774 rw_exit(&ipst->ips_ire_dep_lock);
2775 2775 return (ire);
2776 2776 }
2777 2777
2778 2778 /*
2779 2779 * Find the ill used to send packets. This will be NULL in case
2780 2780 * of a reject or blackhole.
2781 2781 * The returned ill is held; caller needs to do ill_refrele when done.
2782 2782 */
2783 2783 ill_t *
2784 2784 ire_nexthop_ill(ire_t *ire)
2785 2785 {
2786 2786 ill_t *ill;
2787 2787
2788 2788 ire = ire_nexthop(ire);
2789 2789 if (ire == NULL)
2790 2790 return (NULL);
2791 2791
2792 2792 /* ire_ill can not change for an existing ire */
2793 2793 ill = ire->ire_ill;
2794 2794 if (ill != NULL)
2795 2795 ill_refhold(ill);
2796 2796 ire_refrele(ire);
2797 2797 return (ill);
2798 2798 }
2799 2799
2800 2800 #ifdef DEBUG
2801 2801 static boolean_t
2802 2802 parent_has_child(ire_t *parent, ire_t *child)
2803 2803 {
2804 2804 ire_t *ire;
2805 2805 ire_t *prev;
2806 2806
2807 2807 ire = parent->ire_dep_children;
2808 2808 prev = NULL;
2809 2809 while (ire != NULL) {
2810 2810 if (prev == NULL) {
2811 2811 ASSERT(ire->ire_dep_sib_ptpn ==
2812 2812 &(parent->ire_dep_children));
2813 2813 } else {
2814 2814 ASSERT(ire->ire_dep_sib_ptpn ==
2815 2815 &(prev->ire_dep_sib_next));
2816 2816 }
2817 2817 if (ire == child)
2818 2818 return (B_TRUE);
2819 2819 prev = ire;
2820 2820 ire = ire->ire_dep_sib_next;
2821 2821 }
2822 2822 return (B_FALSE);
2823 2823 }
2824 2824
2825 2825 static void
2826 2826 ire_dep_verify(ire_t *ire)
2827 2827 {
2828 2828 ire_t *parent = ire->ire_dep_parent;
2829 2829 ire_t *child = ire->ire_dep_children;
2830 2830
2831 2831 ASSERT(ire->ire_ipversion == IPV4_VERSION ||
2832 2832 ire->ire_ipversion == IPV6_VERSION);
2833 2833 if (parent != NULL) {
2834 2834 ASSERT(parent->ire_ipversion == IPV4_VERSION ||
2835 2835 parent->ire_ipversion == IPV6_VERSION);
2836 2836 ASSERT(parent->ire_refcnt >= 1);
2837 2837 ASSERT(parent_has_child(parent, ire));
2838 2838 }
2839 2839 if (child != NULL) {
2840 2840 ASSERT(child->ire_ipversion == IPV4_VERSION ||
2841 2841 child->ire_ipversion == IPV6_VERSION);
2842 2842 ASSERT(child->ire_dep_parent == ire);
2843 2843 ASSERT(child->ire_dep_sib_ptpn != NULL);
2844 2844 ASSERT(parent_has_child(ire, child));
2845 2845 }
2846 2846 }
2847 2847 #endif /* DEBUG */
2848 2848
2849 2849 /*
2850 2850 * Assumes ire_dep_parent is set. Remove this child from its parent's linkage.
2851 2851 */
2852 2852 void
2853 2853 ire_dep_remove(ire_t *ire)
2854 2854 {
2855 2855 ip_stack_t *ipst = ire->ire_ipst;
2856 2856 ire_t *parent = ire->ire_dep_parent;
2857 2857 ire_t *next;
2858 2858 nce_t *nce;
2859 2859
2860 2860 ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock));
2861 2861 ASSERT(ire->ire_dep_parent != NULL);
2862 2862 ASSERT(ire->ire_dep_sib_ptpn != NULL);
2863 2863
2864 2864 #ifdef DEBUG
2865 2865 ire_dep_verify(ire);
2866 2866 ire_dep_verify(parent);
2867 2867 #endif
2868 2868
2869 2869 next = ire->ire_dep_sib_next;
2870 2870 if (next != NULL)
2871 2871 next->ire_dep_sib_ptpn = ire->ire_dep_sib_ptpn;
2872 2872
2873 2873 ASSERT(*(ire->ire_dep_sib_ptpn) == ire);
2874 2874 *(ire->ire_dep_sib_ptpn) = ire->ire_dep_sib_next;
2875 2875
2876 2876 ire->ire_dep_sib_ptpn = NULL;
2877 2877 ire->ire_dep_sib_next = NULL;
2878 2878
2879 2879 mutex_enter(&ire->ire_lock);
2880 2880 parent = ire->ire_dep_parent;
2881 2881 ire->ire_dep_parent = NULL;
2882 2882 mutex_exit(&ire->ire_lock);
2883 2883
2884 2884 /*
2885 2885 * Make sure all our children, grandchildren, etc set
2886 2886 * ire_dep_parent_generation to IRE_GENERATION_VERIFY since
2887 2887 * we can no longer guarantee that the children have a current
2888 2888 * ire_nce_cache and ire_nexthop_ill().
2889 2889 */
2890 2890 if (ire->ire_dep_children != NULL)
2891 2891 ire_dep_invalidate_children(ire->ire_dep_children);
2892 2892
2893 2893 /*
2894 2894 * Since the parent is gone we make sure we clear ire_nce_cache.
2895 2895 * We can clear it under ire_lock even if the IRE is used
2896 2896 */
2897 2897 mutex_enter(&ire->ire_lock);
2898 2898 nce = ire->ire_nce_cache;
2899 2899 ire->ire_nce_cache = NULL;
2900 2900 mutex_exit(&ire->ire_lock);
2901 2901 if (nce != NULL)
2902 2902 nce_refrele(nce);
2903 2903
2904 2904 #ifdef DEBUG
2905 2905 ire_dep_verify(ire);
2906 2906 ire_dep_verify(parent);
2907 2907 #endif
2908 2908
2909 2909 ire_refrele_notr(parent);
2910 2910 ire_refrele_notr(ire);
2911 2911 }
2912 2912
2913 2913 /*
2914 2914 * Insert the child in the linkage of the parent
2915 2915 */
2916 2916 static void
2917 2917 ire_dep_parent_insert(ire_t *child, ire_t *parent)
2918 2918 {
2919 2919 ip_stack_t *ipst = child->ire_ipst;
2920 2920 ire_t *next;
2921 2921
2922 2922 ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock));
2923 2923 ASSERT(child->ire_dep_parent == NULL);
2924 2924
2925 2925 #ifdef DEBUG
2926 2926 ire_dep_verify(child);
2927 2927 ire_dep_verify(parent);
2928 2928 #endif
2929 2929 /* No parents => no siblings */
2930 2930 ASSERT(child->ire_dep_sib_ptpn == NULL);
2931 2931 ASSERT(child->ire_dep_sib_next == NULL);
2932 2932
2933 2933 ire_refhold_notr(parent);
2934 2934 ire_refhold_notr(child);
2935 2935
2936 2936 /* Head insertion */
2937 2937 next = parent->ire_dep_children;
2938 2938 if (next != NULL) {
2939 2939 ASSERT(next->ire_dep_sib_ptpn == &(parent->ire_dep_children));
2940 2940 child->ire_dep_sib_next = next;
2941 2941 next->ire_dep_sib_ptpn = &(child->ire_dep_sib_next);
2942 2942 }
2943 2943 parent->ire_dep_children = child;
2944 2944 child->ire_dep_sib_ptpn = &(parent->ire_dep_children);
2945 2945
2946 2946 mutex_enter(&child->ire_lock);
2947 2947 child->ire_dep_parent = parent;
2948 2948 mutex_exit(&child->ire_lock);
2949 2949
2950 2950 #ifdef DEBUG
2951 2951 ire_dep_verify(child);
2952 2952 ire_dep_verify(parent);
2953 2953 #endif
2954 2954 }
2955 2955
2956 2956
2957 2957 /*
2958 2958 * Given count worth of ires and generations, build ire_dep_* relationships
2959 2959 * from ires[0] to ires[count-1]. Record generations[i+1] in
2960 2960 * ire_dep_parent_generation for ires[i].
2961 2961 * We graft onto an existing parent chain by making sure that we don't
2962 2962 * touch ire_dep_parent for ires[count-1].
2963 2963 *
2964 2964 * We check for any condemned ire_generation count and return B_FALSE in
2965 2965 * that case so that the caller can tear it apart.
2966 2966 *
2967 2967 * Note that generations[0] is not used. Caller handles that.
2968 2968 */
2969 2969 boolean_t
2970 2970 ire_dep_build(ire_t *ires[], uint_t generations[], uint_t count)
2971 2971 {
2972 2972 ire_t *ire = ires[0];
2973 2973 ip_stack_t *ipst;
2974 2974 uint_t i;
2975 2975
2976 2976 ASSERT(count > 0);
2977 2977 if (count == 1) {
2978 2978 /* No work to do */
2979 2979 return (B_TRUE);
2980 2980 }
2981 2981 ipst = ire->ire_ipst;
2982 2982 rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
2983 2983 /*
2984 2984 * Do not remove the linkage for any existing parent chain i.e.,
2985 2985 * ires[count-1] is left alone.
2986 2986 */
2987 2987 for (i = 0; i < count-1; i++) {
2988 2988 /* Remove existing parent if we need to change it */
2989 2989 if (ires[i]->ire_dep_parent != NULL &&
2990 2990 ires[i]->ire_dep_parent != ires[i+1])
2991 2991 ire_dep_remove(ires[i]);
2992 2992 }
2993 2993
2994 2994 for (i = 0; i < count - 1; i++) {
2995 2995 ASSERT(ires[i]->ire_ipversion == IPV4_VERSION ||
2996 2996 ires[i]->ire_ipversion == IPV6_VERSION);
2997 2997 /* Does it need to change? */
2998 2998 if (ires[i]->ire_dep_parent != ires[i+1])
2999 2999 ire_dep_parent_insert(ires[i], ires[i+1]);
3000 3000
3001 3001 mutex_enter(&ires[i+1]->ire_lock);
3002 3002 if (IRE_IS_CONDEMNED(ires[i+1])) {
3003 3003 mutex_exit(&ires[i+1]->ire_lock);
3004 3004 rw_exit(&ipst->ips_ire_dep_lock);
3005 3005 return (B_FALSE);
3006 3006 }
3007 3007 mutex_exit(&ires[i+1]->ire_lock);
3008 3008
3009 3009 mutex_enter(&ires[i]->ire_lock);
3010 3010 ires[i]->ire_dep_parent_generation = generations[i+1];
3011 3011 mutex_exit(&ires[i]->ire_lock);
3012 3012 }
3013 3013 rw_exit(&ipst->ips_ire_dep_lock);
3014 3014 return (B_TRUE);
3015 3015 }
3016 3016
3017 3017 /*
3018 3018 * Given count worth of ires, unbuild ire_dep_* relationships
3019 3019 * from ires[0] to ires[count-1].
3020 3020 */
3021 3021 void
3022 3022 ire_dep_unbuild(ire_t *ires[], uint_t count)
3023 3023 {
3024 3024 ip_stack_t *ipst;
3025 3025 uint_t i;
3026 3026
3027 3027 if (count == 0) {
3028 3028 /* No work to do */
3029 3029 return;
3030 3030 }
3031 3031 ipst = ires[0]->ire_ipst;
3032 3032 rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
3033 3033 for (i = 0; i < count; i++) {
3034 3034 ASSERT(ires[i]->ire_ipversion == IPV4_VERSION ||
3035 3035 ires[i]->ire_ipversion == IPV6_VERSION);
3036 3036 if (ires[i]->ire_dep_parent != NULL)
3037 3037 ire_dep_remove(ires[i]);
3038 3038 mutex_enter(&ires[i]->ire_lock);
3039 3039 ires[i]->ire_dep_parent_generation = IRE_GENERATION_VERIFY;
3040 3040 mutex_exit(&ires[i]->ire_lock);
3041 3041 }
3042 3042 rw_exit(&ipst->ips_ire_dep_lock);
3043 3043 }
3044 3044
3045 3045 /*
3046 3046 * Both the forwarding and the outbound code paths can trip on
3047 3047 * a condemned NCE, in which case we call this function.
3048 3048 * We have two different behaviors: if the NCE was UNREACHABLE
3049 3049 * it is an indication that something failed. In that case
3050 3050 * we see if we should look for a different IRE (for example,
3051 3051 * delete any matching redirect IRE, or try a different
3052 3052 * IRE_DEFAULT (ECMP)). We mark the ire as bad so a hopefully
3053 3053 * different IRE will be picked next time we send/forward.
3054 3054 *
3055 3055 * If we are called by the output path then fail_if_better is set
3056 3056 * and we return NULL if there could be a better IRE. This is because the
3057 3057 * output path retries the IRE lookup. (The input/forward path can not retry.)
3058 3058 *
3059 3059 * If the NCE was not unreachable then we pick/allocate a
3060 3060 * new (most likely ND_INITIAL) NCE and proceed with it.
3061 3061 *
3062 3062 * ipha/ip6h are needed for multicast packets; ipha needs to be
3063 3063 * set for IPv4 and ip6h needs to be set for IPv6 packets.
3064 3064 */
3065 3065 nce_t *
3066 3066 ire_handle_condemned_nce(nce_t *nce, ire_t *ire, ipha_t *ipha, ip6_t *ip6h,
3067 3067 boolean_t fail_if_better)
3068 3068 {
3069 3069 if (nce->nce_common->ncec_state == ND_UNREACHABLE) {
3070 3070 if (ire_no_good(ire) && fail_if_better) {
3071 3071 /*
3072 3072 * Did some changes, or ECMP likely to exist.
3073 3073 * Make ip_output look for a different IRE
3074 3074 */
3075 3075 return (NULL);
3076 3076 }
3077 3077 }
3078 3078 if (ire_revalidate_nce(ire) == ENETUNREACH) {
3079 3079 /* The ire_dep_parent chain went bad, or no memory? */
3080 3080 (void) ire_no_good(ire);
3081 3081 return (NULL);
3082 3082 }
3083 3083 if (ire->ire_ipversion == IPV4_VERSION) {
3084 3084 ASSERT(ipha != NULL);
3085 3085 nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
3086 3086 } else {
3087 3087 ASSERT(ip6h != NULL);
3088 3088 nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst);
3089 3089 }
3090 3090
3091 3091 if (nce == NULL)
3092 3092 return (NULL);
3093 3093 if (nce->nce_is_condemned) {
3094 3094 nce_refrele(nce);
3095 3095 return (NULL);
3096 3096 }
3097 3097 return (nce);
3098 3098 }
3099 3099
3100 3100 /*
3101 3101 * The caller has found that the ire is bad, either due to a reference to an NCE
3102 3102 * in ND_UNREACHABLE state, or a MULTIRT route whose gateway can't be resolved.
3103 3103 * We update things so a subsequent attempt to send to the destination
3104 3104 * is likely to find a different IRE, or that a new NCE would be created.
3105 3105 *
3106 3106 * Returns B_TRUE if it is likely that a subsequent ire_ftable_lookup would
3107 3107 * find a different route (either due to having deleted a redirect, or there
3108 3108 * being ECMP routes.)
3109 3109 *
3110 3110 * If we have a redirect (RTF_DYNAMIC) we delete it.
3111 3111 * Otherwise we increment ire_badcnt and increment the generation number so
3112 3112 * that a cached ixa_ire will redo the route selection. ire_badcnt is taken
3113 3113 * into account in the route selection when we have multiple choices (multiple
3114 3114 * default routes or ECMP in general).
3115 3115 * Any time ip_select_route finds an ire with a condemned ire_nce_cache
3116 3116 * (e.g., if no equal cost route to the bad one) ip_select_route will make
3117 3117 * sure the NCE is revalidated to avoid getting stuck on a
3118 3118 * NCE_F_CONDEMNED ncec that caused ire_no_good to be called.
3119 3119 */
3120 3120 boolean_t
3121 3121 ire_no_good(ire_t *ire)
3122 3122 {
3123 3123 ip_stack_t *ipst = ire->ire_ipst;
3124 3124 ire_t *ire2;
3125 3125 nce_t *nce;
3126 3126
3127 3127 if (ire->ire_flags & RTF_DYNAMIC) {
3128 3128 ire_delete(ire);
3129 3129 return (B_TRUE);
3130 3130 }
3131 3131 if (ire->ire_flags & RTF_INDIRECT) {
3132 3132 /* Check if next IRE is a redirect */
3133 3133 rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
3134 3134 if (ire->ire_dep_parent != NULL &&
3135 3135 (ire->ire_dep_parent->ire_flags & RTF_DYNAMIC)) {
3136 3136 ire2 = ire->ire_dep_parent;
3137 3137 ire_refhold(ire2);
3138 3138 } else {
3139 3139 ire2 = NULL;
3140 3140 }
3141 3141 rw_exit(&ipst->ips_ire_dep_lock);
3142 3142 if (ire2 != NULL) {
3143 3143 ire_delete(ire2);
3144 3144 ire_refrele(ire2);
3145 3145 return (B_TRUE);
3146 3146 }
3147 3147 }
3148 3148 /*
3149 3149 * No redirect involved. Increment badcnt so that if we have ECMP
3150 3150 * routes we are likely to pick a different one for the next packet.
3151 3151 *
3152 3152 * If the NCE is unreachable and condemned we should drop the reference
3153 3153 * to it so that a new NCE can be created.
3154 3154 *
3155 3155 * Finally we increment the generation number so that any ixa_ire
3156 3156 * cache will be revalidated.
3157 3157 */
3158 3158 mutex_enter(&ire->ire_lock);
3159 3159 ire->ire_badcnt++;
3160 3160 ire->ire_last_badcnt = TICK_TO_SEC(ddi_get_lbolt64());
3161 3161 nce = ire->ire_nce_cache;
3162 3162 if (nce != NULL && nce->nce_is_condemned &&
3163 3163 nce->nce_common->ncec_state == ND_UNREACHABLE)
3164 3164 ire->ire_nce_cache = NULL;
3165 3165 else
3166 3166 nce = NULL;
3167 3167 mutex_exit(&ire->ire_lock);
3168 3168 if (nce != NULL)
3169 3169 nce_refrele(nce);
3170 3170
3171 3171 ire_increment_generation(ire);
3172 3172 ire_dep_incr_generation(ire);
3173 3173
3174 3174 return (ire->ire_bucket->irb_ire_cnt > 1);
3175 3175 }
3176 3176
3177 3177 /*
3178 3178 * Walk ire_dep_parent chain and validate that ire_dep_parent->ire_generation ==
3179 3179 * ire_dep_parent_generation.
3180 3180 * If they all match we just return ire_generation from the topmost IRE.
3181 3181 * Otherwise we propagate the mismatch by setting all ire_dep_parent_generation
3182 3182 * above the mismatch to IRE_GENERATION_VERIFY and also returning
3183 3183 * IRE_GENERATION_VERIFY.
3184 3184 */
3185 3185 uint_t
3186 3186 ire_dep_validate_generations(ire_t *ire)
3187 3187 {
3188 3188 ip_stack_t *ipst = ire->ire_ipst;
3189 3189 uint_t generation;
3190 3190 ire_t *ire1;
3191 3191
3192 3192 rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
3193 3193 generation = ire->ire_generation; /* Assuming things match */
3194 3194 for (ire1 = ire; ire1 != NULL; ire1 = ire1->ire_dep_parent) {
3195 3195 ASSERT(ire1->ire_ipversion == IPV4_VERSION ||
3196 3196 ire1->ire_ipversion == IPV6_VERSION);
3197 3197 if (ire1->ire_dep_parent == NULL)
3198 3198 break;
3199 3199 if (ire1->ire_dep_parent_generation !=
3200 3200 ire1->ire_dep_parent->ire_generation)
3201 3201 goto mismatch;
3202 3202 }
3203 3203 rw_exit(&ipst->ips_ire_dep_lock);
3204 3204 return (generation);
3205 3205
3206 3206 mismatch:
3207 3207 generation = IRE_GENERATION_VERIFY;
3208 3208 /* Fill from top down to the mismatch with _VERIFY */
3209 3209 while (ire != ire1) {
3210 3210 ASSERT(ire->ire_ipversion == IPV4_VERSION ||
3211 3211 ire->ire_ipversion == IPV6_VERSION);
3212 3212 mutex_enter(&ire->ire_lock);
3213 3213 ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY;
3214 3214 mutex_exit(&ire->ire_lock);
3215 3215 ire = ire->ire_dep_parent;
3216 3216 }
3217 3217 rw_exit(&ipst->ips_ire_dep_lock);
3218 3218 return (generation);
3219 3219 }
3220 3220
3221 3221 /*
3222 3222 * Used when we need to return an ire with ire_dep_parent, but we
3223 3223 * know the chain is invalid, e.g., because we didn't create an IRE_IF_CLONE.
3224 3224 * Using IRE_GENERATION_VERIFY means that next time we'll redo the
3225 3225 * recursive lookup.
3226 3226 */
3227 3227 void
3228 3228 ire_dep_invalidate_generations(ire_t *ire)
3229 3229 {
3230 3230 ip_stack_t *ipst = ire->ire_ipst;
3231 3231
3232 3232 rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
3233 3233 while (ire != NULL) {
3234 3234 ASSERT(ire->ire_ipversion == IPV4_VERSION ||
3235 3235 ire->ire_ipversion == IPV6_VERSION);
3236 3236 mutex_enter(&ire->ire_lock);
3237 3237 ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY;
3238 3238 mutex_exit(&ire->ire_lock);
3239 3239 ire = ire->ire_dep_parent;
3240 3240 }
3241 3241 rw_exit(&ipst->ips_ire_dep_lock);
3242 3242 }
3243 3243
3244 3244 /* Set _VERIFY ire_dep_parent_generation for all children recursively */
3245 3245 static void
3246 3246 ire_dep_invalidate_children(ire_t *child)
3247 3247 {
3248 3248 ip_stack_t *ipst = child->ire_ipst;
3249 3249
3250 3250 ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock));
3251 3251 /* Depth first */
3252 3252 if (child->ire_dep_children != NULL)
3253 3253 ire_dep_invalidate_children(child->ire_dep_children);
3254 3254
3255 3255 while (child != NULL) {
3256 3256 mutex_enter(&child->ire_lock);
3257 3257 child->ire_dep_parent_generation = IRE_GENERATION_VERIFY;
3258 3258 mutex_exit(&child->ire_lock);
3259 3259 child = child->ire_dep_sib_next;
3260 3260 }
3261 3261 }
3262 3262
3263 3263 static void
3264 3264 ire_dep_increment_children(ire_t *child)
3265 3265 {
3266 3266 ip_stack_t *ipst = child->ire_ipst;
3267 3267
3268 3268 ASSERT(RW_READ_HELD(&ipst->ips_ire_dep_lock));
3269 3269 /* Depth first */
3270 3270 if (child->ire_dep_children != NULL)
3271 3271 ire_dep_increment_children(child->ire_dep_children);
3272 3272
3273 3273 while (child != NULL) {
3274 3274 if (!IRE_IS_CONDEMNED(child))
3275 3275 ire_increment_generation(child);
3276 3276 child = child->ire_dep_sib_next;
3277 3277 }
3278 3278 }
3279 3279
3280 3280 /*
3281 3281 * Walk all the children of this ire recursively and increment their
3282 3282 * generation number.
3283 3283 */
3284 3284 static void
3285 3285 ire_dep_incr_generation_locked(ire_t *parent)
3286 3286 {
3287 3287 ASSERT(RW_READ_HELD(&parent->ire_ipst->ips_ire_dep_lock));
3288 3288 if (parent->ire_dep_children != NULL)
3289 3289 ire_dep_increment_children(parent->ire_dep_children);
3290 3290 }
3291 3291
3292 3292 void
3293 3293 ire_dep_incr_generation(ire_t *parent)
3294 3294 {
3295 3295 ip_stack_t *ipst = parent->ire_ipst;
3296 3296
3297 3297 rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
3298 3298 ire_dep_incr_generation_locked(parent);
3299 3299 rw_exit(&ipst->ips_ire_dep_lock);
3300 3300 }
3301 3301
3302 3302 /*
3303 3303 * Get a new ire_nce_cache for this IRE as well as its nexthop.
3304 3304 * Returns zero if it succeeds. Can fail due to lack of memory or when
3305 3305 * the route has become unreachable. Returns ENOMEM and ENETUNREACH in those
3306 3306 * cases, respectively.
3307 3307 *
3308 3308 * In the in.mpathd case, the ire will have ire_testhidden
3309 3309 * set; so we should create the ncec for the underlying ill.
3310 3310 *
3311 3311 * Note that the error returned by ire_revalidate_nce() is ignored by most
3312 3312 * callers except ire_handle_condemned_nce(), which handles the ENETUNREACH
3313 3313 * error to mark potentially bad ire's. For all the other callers, an
3314 3314 * error return could indicate a transient condition like ENOMEM, or could
3315 3315 * be the result of an interface that is going down/unplumbing. In the former
3316 3316 * case (transient error), we would leave the old stale ire/ire_nce_cache
3317 3317 * in place, and possibly use incorrect link-layer information to send packets
3318 3318 * but would eventually recover. In the latter case (ill down/replumb),
3319 3319 * ire_revalidate_nce() might return a condemned nce back, but we would then
3320 3320 * recover in the packet output path.
3321 3321 */
3322 3322 int
3323 3323 ire_revalidate_nce(ire_t *ire)
3324 3324 {
3325 3325 nce_t *nce, *old_nce;
3326 3326 ire_t *nexthop;
3327 3327
3328 3328 /*
3329 3329 * For multicast we conceptually have an NCE but we don't store it
3330 3330 * in ire_nce_cache; when ire_to_nce is called we allocate the nce.
3331 3331 */
3332 3332 if (ire->ire_type & IRE_MULTICAST)
3333 3333 return (0);
3334 3334
3335 3335 /* ire_testhidden should only be set on under-interfaces */
3336 3336 ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill));
3337 3337
3338 3338 nexthop = ire_nexthop(ire);
3339 3339 if (nexthop == NULL) {
3340 3340 /* The route is potentially bad */
3341 3341 (void) ire_no_good(ire);
3342 3342 return (ENETUNREACH);
3343 3343 }
3344 3344 if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
3345 3345 ASSERT(ire->ire_ill != NULL);
3346 3346
3347 3347 if (ire->ire_ipversion == IPV4_VERSION)
3348 3348 nce = nce_lookup_v4(ire->ire_ill, &ire->ire_addr);
3349 3349 else
3350 3350 nce = nce_lookup_v6(ire->ire_ill, &ire->ire_addr_v6);
3351 3351 } else {
3352 3352 ASSERT(nexthop->ire_type & IRE_ONLINK);
3353 3353 if (ire->ire_ipversion == IPV4_VERSION) {
3354 3354 nce = arp_nce_init(nexthop->ire_ill, nexthop->ire_addr,
3355 3355 nexthop->ire_type);
3356 3356 } else {
3357 3357 nce = ndp_nce_init(nexthop->ire_ill,
3358 3358 &nexthop->ire_addr_v6, nexthop->ire_type);
3359 3359 }
3360 3360 }
3361 3361 if (nce == NULL) {
3362 3362 /*
3363 3363 * Leave the old stale one in place to avoid a NULL
3364 3364 * ire_nce_cache.
3365 3365 */
3366 3366 ire_refrele(nexthop);
3367 3367 return (ENOMEM);
3368 3368 }
3369 3369
3370 3370 if (nexthop != ire) {
3371 3371 /* Update the nexthop ire */
3372 3372 mutex_enter(&nexthop->ire_lock);
3373 3373 old_nce = nexthop->ire_nce_cache;
3374 3374 if (!IRE_IS_CONDEMNED(nexthop)) {
3375 3375 nce_refhold(nce);
3376 3376 nexthop->ire_nce_cache = nce;
3377 3377 } else {
3378 3378 nexthop->ire_nce_cache = NULL;
3379 3379 }
3380 3380 mutex_exit(&nexthop->ire_lock);
3381 3381 if (old_nce != NULL)
3382 3382 nce_refrele(old_nce);
3383 3383 }
3384 3384 ire_refrele(nexthop);
3385 3385
3386 3386 mutex_enter(&ire->ire_lock);
3387 3387 old_nce = ire->ire_nce_cache;
3388 3388 if (!IRE_IS_CONDEMNED(ire)) {
3389 3389 nce_refhold(nce);
3390 3390 ire->ire_nce_cache = nce;
3391 3391 } else {
3392 3392 ire->ire_nce_cache = NULL;
3393 3393 }
3394 3394 mutex_exit(&ire->ire_lock);
3395 3395 if (old_nce != NULL)
3396 3396 nce_refrele(old_nce);
3397 3397
3398 3398 nce_refrele(nce);
3399 3399 return (0);
3400 3400 }
3401 3401
3402 3402 /*
3403 3403 * Get a held nce for a given ire.
3404 3404 * In the common case this is just from ire_nce_cache.
3405 3405 * For IRE_MULTICAST this needs to do an explicit lookup since we do not
3406 3406 * have an IRE_MULTICAST per address.
3407 3407 * Note that this explicitly returns CONDEMNED NCEs. The caller needs those
3408 3408 * so they can check whether the NCE went unreachable (as opposed to was
3409 3409 * condemned for some other reason).
3410 3410 */
3411 3411 nce_t *
3412 3412 ire_to_nce(ire_t *ire, ipaddr_t v4nexthop, const in6_addr_t *v6nexthop)
3413 3413 {
3414 3414 nce_t *nce;
3415 3415
3416 3416 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
3417 3417 return (NULL);
3418 3418
3419 3419 /* ire_testhidden should only be set on under-interfaces */
3420 3420 ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill));
3421 3421
3422 3422 mutex_enter(&ire->ire_lock);
3423 3423 nce = ire->ire_nce_cache;
3424 3424 if (nce != NULL) {
3425 3425 nce_refhold(nce);
3426 3426 mutex_exit(&ire->ire_lock);
3427 3427 return (nce);
3428 3428 }
3429 3429 mutex_exit(&ire->ire_lock);
3430 3430
3431 3431 if (ire->ire_type & IRE_MULTICAST) {
3432 3432 ASSERT(ire->ire_ill != NULL);
3433 3433
3434 3434 if (ire->ire_ipversion == IPV4_VERSION) {
3435 3435 ASSERT(v6nexthop == NULL);
3436 3436
3437 3437 nce = arp_nce_init(ire->ire_ill, v4nexthop,
3438 3438 ire->ire_type);
3439 3439 } else {
3440 3440 ASSERT(v6nexthop != NULL);
3441 3441 ASSERT(v4nexthop == 0);
3442 3442 nce = ndp_nce_init(ire->ire_ill, v6nexthop,
3443 3443 ire->ire_type);
3444 3444 }
3445 3445 return (nce);
3446 3446 }
3447 3447 return (NULL);
3448 3448 }
3449 3449
3450 3450 nce_t *
3451 3451 ire_to_nce_pkt(ire_t *ire, mblk_t *mp)
3452 3452 {
3453 3453 ipha_t *ipha;
3454 3454 ip6_t *ip6h;
3455 3455
3456 3456 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
3457 3457 ipha = (ipha_t *)mp->b_rptr;
3458 3458 return (ire_to_nce(ire, ipha->ipha_dst, NULL));
3459 3459 } else {
3460 3460 ip6h = (ip6_t *)mp->b_rptr;
3461 3461 return (ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst));
3462 3462 }
3463 3463 }
3464 3464
3465 3465 /*
3466 3466 * Given an IRE_INTERFACE (that matches more than one address) create
3467 3467 * and return an IRE_IF_CLONE for the specific address.
3468 3468 * Return the IRE's generation number in *generationp (when non-NULL).
3469 3469 * Returns NULL if there is no memory for the IRE.
3470 3470 * Handles both IPv4 and IPv6.
3471 3471 *
3472 3472 * IRE_IF_CLONE entries may only be created and added by calling
3473 3473 * ire_create_if_clone(), and we depend on the fact that ire_add will
3474 3474 * atomically ensure that attempts to add multiple identical IRE_IF_CLONE
3475 3475 * entries will not result in duplicate (i.e., ire_identical_ref > 1)
3476 3476 * CLONE entries, so that a single ire_delete is sufficient to remove the
3477 3477 * CLONE.
3478 3478 */
3479 3479 ire_t *
3480 3480 ire_create_if_clone(ire_t *ire_if, const in6_addr_t *addr, uint_t *generationp)
3481 3481 {
3482 3482 ire_t *ire;
3483 3483 ire_t *nire;
3484 3484
3485 3485 if (ire_if->ire_ipversion == IPV4_VERSION) {
3486 3486 ipaddr_t v4addr;
3487 3487 ipaddr_t mask = IP_HOST_MASK;
3488 3488
3489 3489 ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
3490 3490 IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
3491 3491
3492 3492 ire = ire_create(
3493 3493 (uchar_t *)&v4addr, /* dest address */
3494 3494 (uchar_t *)&mask, /* mask */
3495 3495 (uchar_t *)&ire_if->ire_gateway_addr,
3496 3496 IRE_IF_CLONE, /* IRE type */
3497 3497 ire_if->ire_ill,
3498 3498 ire_if->ire_zoneid,
3499 3499 ire_if->ire_flags | RTF_HOST,
3500 3500 NULL, /* No security attr for IRE_IF_ALL */
3501 3501 ire_if->ire_ipst);
3502 3502 } else {
3503 3503 ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));
3504 3504 ire = ire_create_v6(
3505 3505 addr, /* dest address */
3506 3506 &ipv6_all_ones, /* mask */
3507 3507 &ire_if->ire_gateway_addr_v6, /* gateway addr */
3508 3508 IRE_IF_CLONE, /* IRE type */
3509 3509 ire_if->ire_ill,
3510 3510 ire_if->ire_zoneid,
3511 3511 ire_if->ire_flags | RTF_HOST,
3512 3512 NULL, /* No security attr for IRE_IF_ALL */
3513 3513 ire_if->ire_ipst);
3514 3514 }
3515 3515 if (ire == NULL)
3516 3516 return (NULL);
3517 3517
3518 3518 /* Take the metrics, in particular the mtu, from the IRE_IF */
3519 3519 ire->ire_metrics = ire_if->ire_metrics;
3520 3520
3521 3521 nire = ire_add(ire);
3522 3522 if (nire == NULL) /* Some failure */
3523 3523 return (NULL);
3524 3524
3525 3525 if (generationp != NULL)
3526 3526 *generationp = nire->ire_generation;
3527 3527
3528 3528 return (nire);
3529 3529 }
3530 3530
3531 3531 /*
3532 3532 * The argument is an IRE_INTERFACE. Delete all of the IRE_IF_CLONEs in the
3533 3533 * ire_dep_children (just walk the ire_dep_sib_next since they are all
3534 3534 * immediate children.)
3535 3535 * Since we hold a lock while we remove them we need to defer the actual
3536 3536 * calls to ire_delete() until we have dropped the lock. This makes things
3537 3537 * less efficient since we restart at the top after dropping the lock. But
3538 3538 * we only run when an IRE_INTERFACE is deleted which is infrequent.
3539 3539 *
3540 3540 * Note that ire_dep_children can be any mixture of offlink routes and
3541 3541 * IRE_IF_CLONE entries.
3542 3542 */
3543 3543 void
3544 3544 ire_dep_delete_if_clone(ire_t *parent)
3545 3545 {
3546 3546 ip_stack_t *ipst = parent->ire_ipst;
3547 3547 ire_t *child, *next;
3548 3548
3549 3549 restart:
3550 3550 rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
3551 3551 if (parent->ire_dep_children == NULL) {
3552 3552 rw_exit(&ipst->ips_ire_dep_lock);
3553 3553 return;
3554 3554 }
3555 3555 child = parent->ire_dep_children;
3556 3556 while (child != NULL) {
3557 3557 next = child->ire_dep_sib_next;
3558 3558 if ((child->ire_type & IRE_IF_CLONE) &&
3559 3559 !IRE_IS_CONDEMNED(child)) {
3560 3560 ire_refhold(child);
3561 3561 rw_exit(&ipst->ips_ire_dep_lock);
3562 3562 ire_delete(child);
3563 3563 ASSERT(IRE_IS_CONDEMNED(child));
3564 3564 ire_refrele(child);
3565 3565 goto restart;
3566 3566 }
3567 3567 child = next;
3568 3568 }
3569 3569 rw_exit(&ipst->ips_ire_dep_lock);
3570 3570 }
3571 3571
3572 3572 /*
3573 3573 * In the preferred/strict src multihoming modes, unbound routes (i.e.,
3574 3574 * ire_t entries with ire_unbound set to B_TRUE) are bound to an interface
3575 3575 * by selecting the first available interface that has an interface route for
3576 3576 * the ire_gateway. If that interface is subsequently brought down, ill_downi()
3577 3577 * will call ire_rebind() so that the unbound route can be bound to some other
3578 3578 * matching interface thereby preserving the intended reachability information
3579 3579 * from the original unbound route.
3580 3580 */
3581 3581 void
3582 3582 ire_rebind(ire_t *ire)
3583 3583 {
3584 3584 ire_t *gw_ire, *new_ire;
3585 3585 int match_flags = MATCH_IRE_TYPE;
3586 3586 ill_t *gw_ill;
3587 3587 boolean_t isv6 = (ire->ire_ipversion == IPV6_VERSION);
3588 3588 ip_stack_t *ipst = ire->ire_ipst;
3589 3589
3590 3590 ASSERT(ire->ire_unbound);
3591 3591 again:
3592 3592 if (isv6) {
3593 3593 gw_ire = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6, 0, 0,
3594 3594 IRE_INTERFACE, NULL, ALL_ZONES, NULL, match_flags, 0,
3595 3595 ipst, NULL);
3596 3596 } else {
3597 3597 gw_ire = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0,
3598 3598 IRE_INTERFACE, NULL, ALL_ZONES, NULL, match_flags, 0,
3599 3599 ipst, NULL);
3600 3600 }
3601 3601 if (gw_ire == NULL) {
3602 3602 /* see comments in ip_rt_add[_v6]() for IPMP */
3603 3603 if (match_flags & MATCH_IRE_TESTHIDDEN)
3604 3604 return;
3605 3605
3606 3606 match_flags |= MATCH_IRE_TESTHIDDEN;
3607 3607 goto again;
3608 3608 }
3609 3609 gw_ill = gw_ire->ire_ill;
3610 3610 if (isv6) {
3611 3611 new_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6,
3612 3612 &ire->ire_gateway_addr_v6, ire->ire_type, gw_ill,
3613 3613 ire->ire_zoneid, ire->ire_flags, NULL, ipst);
3614 3614 } else {
3615 3615 new_ire = ire_create((uchar_t *)&ire->ire_addr,
3616 3616 (uchar_t *)&ire->ire_mask,
3617 3617 (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, gw_ill,
3618 3618 ire->ire_zoneid, ire->ire_flags, NULL, ipst);
3619 3619 }
3620 3620 ire_refrele(gw_ire);
3621 3621 if (new_ire == NULL)
3622 3622 return;
3623 3623 new_ire->ire_unbound = B_TRUE;
3624 3624 new_ire = ire_add(new_ire);
3625 3625 if (new_ire != NULL)
3626 3626 ire_refrele(new_ire);
3627 3627 }
↓ open down ↓ |
996 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX