1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/conf.h>
  29 #include <sys/time.h>
  30 #include <sys/taskq.h>
  31 #include <sys/cmn_err.h>
  32 #include <sys/sdt.h>
  33 #include <sys/atomic.h>
  34 #include <netinet/in.h>
  35 #include <inet/ip.h>
  36 #include <inet/ip6.h>
  37 #include <inet/tcp.h>
  38 #include <inet/udp_impl.h>
  39 #include <inet/ilb.h>
  40 
  41 #include "ilb_stack.h"
  42 #include "ilb_impl.h"
  43 #include "ilb_conn.h"
  44 #include "ilb_nat.h"
  45 
/*
 * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection.  Each
 * instance describes the slice of a hash table that one gc timer covers.
 *
 * start: starting index into the hash table to do gc (inclusive)
 * end: ending index into the hash table to do gc (exclusive; the gc loop
 *      runs while the index is less than end)
 * ilbs: pointer to the ilb_stack_t of the IP stack
 * tid_lock: mutex to protect the timer id.
 * tid: timer id of the timer.  It is set to 0 to tell the timer handler
 *      not to restart the timer.
 */
typedef struct ilb_timer_s {
        uint32_t        start;
        uint32_t        end;
        ilb_stack_t     *ilbs;
        kmutex_t        tid_lock;
        timeout_id_t    tid;
} ilb_timer_t;
  62 
/*
 * Hash macro for finding the index to the conn hash table.
 *
 * saddr and daddr are dereferenced at offsets 0 through 3.  The callers in
 * this file pass uint8_t pointers to the last 32 bits of an in6_addr_t, and
 * ports converted with ntohs().  The result is masked with (hash_size - 1),
 * so hash_size has to be a power of 2 for every bucket to be reachable.
 */
#define ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size)    \
        (((*((saddr) + 3) ^ *((daddr) + 3)) * 50653 +           \
        (*((saddr) + 2) ^ *((daddr) + 2)) * 1369 +              \
        (*((saddr) + 1) ^ *((daddr) + 1)) * 37 +                \
        (*(saddr) ^ *(daddr)) + (sport) * 37 + (dport)) &   \
        ((hash_size) - 1))
  70 
/*
 * Kmem cache for the conn hash entry.  It is created on demand by
 * ilb_conn_cache_init() and stays NULL until then.
 */
static struct kmem_cache *ilb_conn_cache = NULL;

/*
 * There are 60 timers running to do conn cache garbage collection.  Each
 * gc thread is responsible for 1/60 of the conn hash table.  The same value
 * is also used to size the gc taskq in ilb_conn_hash_init().
 */
static int ilb_conn_timer_size = 60;

/*
 * Each of the above gc timers wake up every 15s to do the gc.  The value is
 * in seconds and is converted with SEC_TO_TICK() when a timer is armed.
 */
static int ilb_conn_cache_timeout = 15;
  82 
/*
 * Hash macro for the sticky table.  saddr is dereferenced at offsets 0
 * through 3 and each element is mixed with successive 8-bit shifts of rule.
 * The result is masked with (hash_size - 1), so hash_size is expected to be
 * a power of 2.
 * NOTE(review): presumably saddr is a uint8_t pointer to the last 32 bits of
 * the client address, as with ILB_CONN_HASH -- confirm against the callers.
 */
#define ILB_STICKY_HASH(saddr, rule, hash_size)                 \
        (((*((saddr) + 3) ^ ((rule) >> 24)) * 29791 +             \
        (*((saddr) + 2) ^ ((rule) >> 16)) * 961 +         \
        (*((saddr) + 1) ^ ((rule) >> 8)) * 31 +                   \
        (*(saddr) ^ (rule))) & ((hash_size) - 1))
  88 
/* Kmem cache for the sticky hash entry; NULL until it has been created. */
static struct kmem_cache *ilb_sticky_cache = NULL;

/*
 * There are 60 timers running to do sticky cache garbage collection.  Each
 * gc thread is responsible for 1/60 of the sticky hash table.
 */
static int ilb_sticky_timer_size = 60;

/* Each of the above gc timers wake up every 15s (value is in seconds). */
static int ilb_sticky_timeout = 15;
  99 
 100 #define ILB_STICKY_REFRELE(s)                   \
 101 {                                               \
 102         mutex_enter(&(s)->hash->sticky_lock); \
 103         (s)->refcnt--;                               \
 104         (s)->atime = ddi_get_lbolt64();              \
 105         mutex_exit(&s->hash->sticky_lock);    \
 106 }
 107 
 108 
 109 static void
 110 ilb_conn_cache_init(void)
 111 {
 112         ilb_conn_cache = kmem_cache_create("ilb_conn_cache",
 113             sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL,
 114             ilb_kmem_flags);
 115 }
 116 
 117 void
 118 ilb_conn_cache_fini(void)
 119 {
 120         if (ilb_conn_cache != NULL) {
 121                 kmem_cache_destroy(ilb_conn_cache);
 122                 ilb_conn_cache = NULL;
 123         }
 124 }
 125 
 126 static void
 127 ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s)
 128 {
 129         ilb_conn_hash_t *hash;
 130         ilb_conn_t **next, **prev;
 131         ilb_conn_t **next_prev, **prev_next;
 132 
 133         if (c2s) {
 134                 hash = connp->conn_c2s_hash;
 135                 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
 136                 next = &connp->conn_c2s_next;
 137                 prev = &connp->conn_c2s_prev;
 138                 if (*next != NULL)
 139                         next_prev = &(*next)->conn_c2s_prev;
 140                 if (*prev != NULL)
 141                         prev_next = &(*prev)->conn_c2s_next;
 142         } else {
 143                 hash = connp->conn_s2c_hash;
 144                 ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
 145                 next = &connp->conn_s2c_next;
 146                 prev = &connp->conn_s2c_prev;
 147                 if (*next != NULL)
 148                         next_prev = &(*next)->conn_s2c_prev;
 149                 if (*prev != NULL)
 150                         prev_next = &(*prev)->conn_s2c_next;
 151         }
 152 
 153         if (hash->ilb_connp == connp) {
 154                 hash->ilb_connp = *next;
 155                 if (*next != NULL)
 156                         *next_prev = NULL;
 157         } else {
 158                 if (*prev != NULL)
 159                         *prev_next = *next;
 160                 if (*next != NULL)
 161                         *next_prev = *prev;
 162         }
 163         ASSERT(hash->ilb_conn_cnt > 0);
 164         hash->ilb_conn_cnt--;
 165 
 166         *next = NULL;
 167         *prev = NULL;
 168 }
 169 
 170 static void
 171 ilb_conn_remove(ilb_conn_t *connp)
 172 {
 173         ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
 174         ilb_conn_remove_common(connp, B_TRUE);
 175         ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
 176         ilb_conn_remove_common(connp, B_FALSE);
 177 
 178         if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
 179                 in_port_t port;
 180 
 181                 port = ntohs(connp->conn_rule_cache.info.nat_sport);
 182                 vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena,
 183                     (void *)(uintptr_t)port, 1);
 184         }
 185 
 186         if (connp->conn_sticky != NULL)
 187                 ILB_STICKY_REFRELE(connp->conn_sticky);
 188         ILB_SERVER_REFRELE(connp->conn_server);
 189         kmem_cache_free(ilb_conn_cache, connp);
 190 }
 191 
 192 /*
 193  * Routine to do periodic garbage collection of conn hash entries.  When
 194  * a conn hash timer fires, it dispatches a taskq to call this function
 195  * to do the gc.  Note that each taskq is responisble for a portion of
 196  * the table.  The portion is stored in timer->start, timer->end.
 197  */
 198 static void
 199 ilb_conn_cleanup(void *arg)
 200 {
 201         ilb_timer_t *timer = (ilb_timer_t *)arg;
 202         uint32_t i;
 203         ilb_stack_t *ilbs;
 204         ilb_conn_hash_t *c2s_hash, *s2c_hash;
 205         ilb_conn_t *connp, *nxt_connp;
 206         int64_t now;
 207         int64_t expiry;
 208         boolean_t die_now;
 209 
 210         ilbs = timer->ilbs;
 211         c2s_hash = ilbs->ilbs_c2s_conn_hash;
 212         ASSERT(c2s_hash != NULL);
 213 
 214         now = ddi_get_lbolt64();
 215         for (i = timer->start; i < timer->end; i++) {
 216                 mutex_enter(&c2s_hash[i].ilb_conn_hash_lock);
 217                 if ((connp = c2s_hash[i].ilb_connp) == NULL) {
 218                         ASSERT(c2s_hash[i].ilb_conn_cnt == 0);
 219                         mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
 220                         continue;
 221                 }
 222                 do {
 223                         ASSERT(c2s_hash[i].ilb_conn_cnt > 0);
 224                         ASSERT(connp->conn_c2s_hash == &c2s_hash[i]);
 225                         nxt_connp = connp->conn_c2s_next;
 226                         expiry = now - SEC_TO_TICK(connp->conn_expiry);
 227                         if (connp->conn_server->iser_die_time != 0 &&
 228                             connp->conn_server->iser_die_time < now)
 229                                 die_now = B_TRUE;
 230                         else
 231                                 die_now = B_FALSE;
 232                         s2c_hash = connp->conn_s2c_hash;
 233                         mutex_enter(&s2c_hash->ilb_conn_hash_lock);
 234 
 235                         if (connp->conn_gc || die_now ||
 236                             (connp->conn_c2s_atime < expiry &&
 237                             connp->conn_s2c_atime < expiry)) {
 238                                 /* Need to update the nat list cur_connp */
 239                                 if (connp == ilbs->ilbs_conn_list_connp) {
 240                                         ilbs->ilbs_conn_list_connp =
 241                                             connp->conn_c2s_next;
 242                                 }
 243                                 ilb_conn_remove(connp);
 244                                 goto nxt_connp;
 245                         }
 246 
 247                         if (connp->conn_l4 != IPPROTO_TCP)
 248                                 goto nxt_connp;
 249 
 250                         /* Update and check TCP related conn info */
 251                         if (connp->conn_c2s_tcp_fin_sent &&
 252                             SEQ_GT(connp->conn_s2c_tcp_ack,
 253                             connp->conn_c2s_tcp_fss)) {
 254                                 connp->conn_c2s_tcp_fin_acked = B_TRUE;
 255                         }
 256                         if (connp->conn_s2c_tcp_fin_sent &&
 257                             SEQ_GT(connp->conn_c2s_tcp_ack,
 258                             connp->conn_s2c_tcp_fss)) {
 259                                 connp->conn_s2c_tcp_fin_acked = B_TRUE;
 260                         }
 261                         if (connp->conn_c2s_tcp_fin_acked &&
 262                             connp->conn_s2c_tcp_fin_acked) {
 263                                 ilb_conn_remove(connp);
 264                         }
 265 nxt_connp:
 266                         mutex_exit(&s2c_hash->ilb_conn_hash_lock);
 267                         connp = nxt_connp;
 268                 } while (connp != NULL);
 269                 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
 270         }
 271 }
 272 
 273 /* Conn hash timer routine.  It dispatches a taskq and restart the timer */
 274 static void
 275 ilb_conn_timer(void *arg)
 276 {
 277         ilb_timer_t *timer = (ilb_timer_t *)arg;
 278 
 279         (void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup,
 280             arg, TQ_SLEEP);
 281         mutex_enter(&timer->tid_lock);
 282         if (timer->tid == 0) {
 283                 mutex_exit(&timer->tid_lock);
 284         } else {
 285                 timer->tid = timeout(ilb_conn_timer, arg,
 286                     SEC_TO_TICK(ilb_conn_cache_timeout));
 287                 mutex_exit(&timer->tid_lock);
 288         }
 289 }
 290 
 291 void
 292 ilb_conn_hash_init(ilb_stack_t *ilbs)
 293 {
 294         extern pri_t minclsyspri;
 295         int i, part;
 296         ilb_timer_t *tm;
 297         char tq_name[TASKQ_NAMELEN];
 298 
 299         /*
 300          * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to
 301          * the next power of 2.
 302          */
 303         if (ilbs->ilbs_conn_hash_size & (ilbs->ilbs_conn_hash_size - 1)) {
 304                 for (i = 0; i < 31; i++) {
 305                         if (ilbs->ilbs_conn_hash_size < (1 << i))
 306                                 break;
 307                 }
 308                 ilbs->ilbs_conn_hash_size = 1 << i;
 309         }
 310 
 311         /*
 312          * Can sleep since this should be called when a rule is being added,
 313          * hence we are not in interrupt context.
 314          */
 315         ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
 316             ilbs->ilbs_conn_hash_size, KM_SLEEP);
 317         ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
 318             ilbs->ilbs_conn_hash_size, KM_SLEEP);
 319 
 320         for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
 321                 mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock,
 322                     NULL, MUTEX_DEFAULT, NULL);
 323         }
 324         for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
 325                 mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock,
 326                     NULL, MUTEX_DEFAULT, NULL);
 327         }
 328 
 329         if (ilb_conn_cache == NULL)
 330                 ilb_conn_cache_init();
 331 
 332         (void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p",
 333             (void *)ilbs->ilbs_netstack);
 334         ASSERT(ilbs->ilbs_conn_taskq == NULL);
 335         ilbs->ilbs_conn_taskq = taskq_create(tq_name,
 336             ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size,
 337             ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
 338 
 339         ASSERT(ilbs->ilbs_conn_timer_list == NULL);
 340         ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
 341             ilb_conn_timer_size, KM_SLEEP);
 342 
 343         /*
 344          * The hash table is divided in equal partition for those timers
 345          * to do garbage collection.
 346          */
 347         part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1;
 348         for (i = 0; i < ilb_conn_timer_size; i++) {
 349                 tm = ilbs->ilbs_conn_timer_list + i;
 350                 tm->start = i * part;
 351                 tm->end = i * part + part;
 352                 if (tm->end > ilbs->ilbs_conn_hash_size)
 353                         tm->end = ilbs->ilbs_conn_hash_size;
 354                 tm->ilbs = ilbs;
 355                 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
 356                 /* Spread out the starting execution time of all the timers. */
 357                 tm->tid = timeout(ilb_conn_timer, tm,
 358                     SEC_TO_TICK(ilb_conn_cache_timeout + i));
 359         }
 360 }
 361 
 362 void
 363 ilb_conn_hash_fini(ilb_stack_t *ilbs)
 364 {
 365         uint32_t i;
 366         ilb_conn_t *connp;
 367 
 368         if (ilbs->ilbs_c2s_conn_hash == NULL) {
 369                 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
 370                 return;
 371         }
 372 
 373         /* Stop all the timers first. */
 374         for (i = 0; i < ilb_conn_timer_size; i++) {
 375                 timeout_id_t tid;
 376 
 377                 /* Setting tid to 0 tells the timer handler not to restart. */
 378                 mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock);
 379                 tid = ilbs->ilbs_conn_timer_list[i].tid;
 380                 ilbs->ilbs_conn_timer_list[i].tid = 0;
 381                 mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock);
 382                 (void) untimeout(tid);
 383         }
 384         kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) *
 385             ilb_conn_timer_size);
 386         taskq_destroy(ilbs->ilbs_conn_taskq);
 387         ilbs->ilbs_conn_taskq = NULL;
 388 
 389         /* Then remove all the conns. */
 390         for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
 391                 while ((connp = ilbs->ilbs_s2c_conn_hash->ilb_connp) != NULL) {
 392                         ilbs->ilbs_s2c_conn_hash->ilb_connp =
 393                             connp->conn_s2c_next;
 394                         ILB_SERVER_REFRELE(connp->conn_server);
 395                         if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
 396                                 ilb_nat_src_entry_t *ent;
 397                                 in_port_t port;
 398 
 399                                 /*
 400                                  * src_ent will be freed in ilb_nat_src_fini().
 401                                  */
 402                                 port = ntohs(
 403                                     connp->conn_rule_cache.info.nat_sport);
 404                                 ent = connp->conn_rule_cache.info.src_ent;
 405                                 vmem_free(ent->nse_port_arena,
 406                                     (void *)(uintptr_t)port, 1);
 407                         }
 408                         kmem_cache_free(ilb_conn_cache, connp);
 409                 }
 410         }
 411         kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) *
 412             ilbs->ilbs_conn_hash_size);
 413         kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) *
 414             ilbs->ilbs_conn_hash_size);
 415 }
 416 
 417 /*
 418  * Internet checksum adjustment calculation routines.  We pre-calculate
 419  * checksum adjustment so that we don't need to compute the checksum on
 420  * the whole packet when we change address/port in the packet.
 421  */
 422 
 423 static void
 424 hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
 425     in_port_t new_port, uint32_t *adj_sum)
 426 {
 427         uint32_t sum;
 428 
 429         sum = *oaddr + *(oaddr + 1) + old_port;
 430         while ((sum >> 16) != 0)
 431                 sum = (sum & 0xffff) + (sum >> 16);
 432         *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) + new_port;
 433 }
 434 
 435 static void
 436 hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
 437     in_port_t new_port, uint32_t *adj_sum)
 438 {
 439         uint32_t sum = 0;
 440 
 441         sum = *oaddr + *(oaddr + 1) + *(oaddr + 2) + *(oaddr + 3) +
 442             *(oaddr + 4) + *(oaddr + 5) + *(oaddr + 6) + *(oaddr + 7) +
 443             old_port;
 444         while ((sum >> 16) != 0)
 445                 sum = (sum & 0xffff) + (sum >> 16);
 446         *adj_sum = (uint16_t)~sum + *naddr + *(naddr + 1) +
 447             *(naddr + 2) + *(naddr + 3) + *(naddr + 4) + *(naddr + 5) +
 448             *(naddr + 6) + *(naddr + 7) + new_port;
 449 }
 450 
 451 static void
 452 fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
 453     uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
 454     in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
 455 {
 456         uint32_t sum;
 457 
 458         sum = *oaddr1 + *(oaddr1 + 1) + old_port1 + *oaddr2 + *(oaddr2 + 1) +
 459             old_port2;
 460         while ((sum >> 16) != 0)
 461                 sum = (sum & 0xffff) + (sum >> 16);
 462         *adj_sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + new_port1 +
 463             *naddr2 + *(naddr2 + 1) + new_port2;
 464 }
 465 
 466 static void
 467 fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
 468     uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
 469     in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
 470 {
 471         uint32_t sum = 0;
 472 
 473         sum = *oaddr1 + *(oaddr1 + 1) + *(oaddr1 + 2) + *(oaddr1 + 3) +
 474             *(oaddr1 + 4) + *(oaddr1 + 5) + *(oaddr1 + 6) + *(oaddr1 + 7) +
 475             old_port1;
 476         sum += *oaddr2 + *(oaddr2 + 1) + *(oaddr2 + 2) + *(oaddr2 + 3) +
 477             *(oaddr2 + 4) + *(oaddr2 + 5) + *(oaddr2 + 6) + *(oaddr2 + 7) +
 478             old_port2;
 479         while ((sum >> 16) != 0)
 480                 sum = (sum & 0xffff) + (sum >> 16);
 481         sum = (uint16_t)~sum + *naddr1 + *(naddr1 + 1) + *(naddr1 + 2) +
 482             *(naddr1 + 3) + *(naddr1 + 4) + *(naddr1 + 5) + *(naddr1 + 6) +
 483             *(naddr1 + 7) + new_port1;
 484         *adj_sum = sum + *naddr2 + *(naddr2 + 1) + *(naddr2 + 2) +
 485             *(naddr2 + 3) + *(naddr2 + 4) + *(naddr2 + 5) + *(naddr2 + 6) +
 486             *(naddr2 + 7) + new_port2;
 487 }
 488 
/*
 * Add a conn hash entry to the tables.  Note that a conn hash entry
 * (ilb_conn_t) contains info on both directions.  And there are two hash
 * tables, one for client to server and the other for server to client.
 * So the same entry is added to both tables and can be accessed by two
 * threads simultaneously.  But each thread will only access data on one
 * direction, so there is no conflict.
 *
 * ilbs: ILB stack instance owning the hash tables
 * rule: rule the packet matched; supplies protocol, topology, IP version
 *      and expiry time
 * server: back end server chosen for the conn; a reference is taken on it
 * src, sport, dst, dport: addresses and ports of the client to server
 *      packet; the ports are passed through ntohs() for hashing
 * info: NAT info; copied into the conn
 * ip_sum, tp_sum: receive the pre-calculated client to server checksum
 *      adjustments so the caller can apply them to the current packet
 * s: sticky entry associated with the conn, or NULL
 *
 * Returns 0 on success.  Returns ENOMEM if no conn entry could be
 * allocated; in that case, if s is not NULL, the reference on s is released
 * and, for full NAT, the NAT source port in info is given back.
 *
 * NOTE(review): when s is NULL and the topology is full NAT, the NAT source
 * port is not freed on the ENOMEM path here -- confirm that the caller
 * takes care of it.
 */
int
ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server,
    in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport,
    ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s)
{
        ilb_conn_t *connp;
        ilb_conn_hash_t *hash;
        int i;

        /* Non-blocking allocation, so it is allowed to fail. */
        connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP);
        if (connp == NULL) {
                if (s != NULL) {
                        if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
                                ilb_nat_src_entry_t **entry;

                                /* Give back the NAT source port. */
                                entry = s->server->iser_nat_src->src_list;
                                vmem_free(entry[s->nat_src_idx]->nse_port_arena,
                                    (void *)(uintptr_t)ntohs(info->nat_sport),
                                    1);
                        }
                        ILB_STICKY_REFRELE(s);
                }
                return (ENOMEM);
        }

        connp->conn_l4 = rule->ir_proto;

        connp->conn_server = server;
        ILB_SERVER_REFHOLD(server);
        connp->conn_sticky = s;

        /* Cache what is needed from the rule so it need not be looked up. */
        connp->conn_rule_cache.topo = rule->ir_topo;
        connp->conn_rule_cache.info = *info;

        connp->conn_gc = B_FALSE;

        connp->conn_expiry = rule->ir_nat_expiry;
        connp->conn_cr_time = ddi_get_lbolt64();

        /* Client to server info. */
        connp->conn_c2s_saddr = *src;
        connp->conn_c2s_sport = sport;
        connp->conn_c2s_daddr = *dst;
        connp->conn_c2s_dport = dport;

        connp->conn_c2s_atime = ddi_get_lbolt64();
        /* The packet that triggers this creation should be counted */
        connp->conn_c2s_pkt_cnt = 1;
        connp->conn_c2s_tcp_fin_sent = B_FALSE;
        connp->conn_c2s_tcp_fin_acked = B_FALSE;

        /* Server to client info, before NAT */
        switch (rule->ir_topo) {
        case ILB_TOPO_IMPL_HALF_NAT:
                /* Only the destination is rewritten; the client is kept. */
                connp->conn_s2c_saddr = info->nat_dst;
                connp->conn_s2c_sport = info->nat_dport;
                connp->conn_s2c_daddr = *src;
                connp->conn_s2c_dport = sport;

                /* Pre-calculate checksum changes for both directions */
                if (rule->ir_ipver == IPPROTO_IP) {
                        /* IP header sum covers addresses only: ports are 0. */
                        hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
                            (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0,
                            &connp->conn_c2s_ip_sum);
                        hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
                            (uint16_t *)&info->nat_dst.s6_addr32[3], dport,
                            info->nat_dport, &connp->conn_c2s_tp_sum);
                        *ip_sum = connp->conn_c2s_ip_sum;
                        *tp_sum = connp->conn_c2s_tp_sum;

                        /* Reverse direction: old and new values swapped. */
                        hnat_cksum_v4(
                            (uint16_t *)&info->nat_dst.s6_addr32[3],
                            (uint16_t *)&dst->s6_addr32[3], 0, 0,
                            &connp->conn_s2c_ip_sum);
                        hnat_cksum_v4(
                            (uint16_t *)&info->nat_dst.s6_addr32[3],
                            (uint16_t *)&dst->s6_addr32[3],
                            info->nat_dport, dport,
                            &connp->conn_s2c_tp_sum);
                } else {
                        /* For IPv6 the IP level adjustment is set to 0. */
                        connp->conn_c2s_ip_sum = 0;
                        hnat_cksum_v6((uint16_t *)dst,
                            (uint16_t *)&info->nat_dst, dport,
                            info->nat_dport, &connp->conn_c2s_tp_sum);
                        *ip_sum = 0;
                        *tp_sum = connp->conn_c2s_tp_sum;

                        connp->conn_s2c_ip_sum = 0;
                        hnat_cksum_v6((uint16_t *)&info->nat_dst,
                            (uint16_t *)dst, info->nat_dport, dport,
                            &connp->conn_s2c_tp_sum);
                }
                break;
        case ILB_TOPO_IMPL_NAT:
                /* Both source and destination are rewritten. */
                connp->conn_s2c_saddr = info->nat_dst;
                connp->conn_s2c_sport = info->nat_dport;
                connp->conn_s2c_daddr = info->nat_src;
                connp->conn_s2c_dport = info->nat_sport;

                if (rule->ir_ipver == IPPROTO_IP) {
                        /* IP header sum covers addresses only: ports are 0. */
                        fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
                            (uint16_t *)&dst->s6_addr32[3],
                            (uint16_t *)&info->nat_src.s6_addr32[3],
                            (uint16_t *)&info->nat_dst.s6_addr32[3],
                            0, 0, 0, 0, &connp->conn_c2s_ip_sum);
                        fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
                            (uint16_t *)&dst->s6_addr32[3],
                            (uint16_t *)&info->nat_src.s6_addr32[3],
                            (uint16_t *)&info->nat_dst.s6_addr32[3],
                            sport, dport, info->nat_sport,
                            info->nat_dport, &connp->conn_c2s_tp_sum);
                        *ip_sum = connp->conn_c2s_ip_sum;
                        *tp_sum = connp->conn_c2s_tp_sum;

                        /* Reverse direction: old and new values swapped. */
                        fnat_cksum_v4(
                            (uint16_t *)&info->nat_src.s6_addr32[3],
                            (uint16_t *)&info->nat_dst.s6_addr32[3],
                            (uint16_t *)&src->s6_addr32[3],
                            (uint16_t *)&dst->s6_addr32[3],
                            0, 0, 0, 0, &connp->conn_s2c_ip_sum);
                        fnat_cksum_v4(
                            (uint16_t *)&info->nat_src.s6_addr32[3],
                            (uint16_t *)&info->nat_dst.s6_addr32[3],
                            (uint16_t *)&src->s6_addr32[3],
                            (uint16_t *)&dst->s6_addr32[3],
                            info->nat_sport, info->nat_dport,
                            sport, dport, &connp->conn_s2c_tp_sum);
                } else {
                        fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst,
                            (uint16_t *)&info->nat_src,
                            (uint16_t *)&info->nat_dst,
                            sport, dport, info->nat_sport,
                            info->nat_dport, &connp->conn_c2s_tp_sum);
                        connp->conn_c2s_ip_sum = 0;
                        *ip_sum = 0;
                        *tp_sum = connp->conn_c2s_tp_sum;

                        fnat_cksum_v6((uint16_t *)&info->nat_src,
                            (uint16_t *)&info->nat_dst, (uint16_t *)src,
                            (uint16_t *)dst, info->nat_sport,
                            info->nat_dport, sport, dport,
                            &connp->conn_s2c_tp_sum);
                        connp->conn_s2c_ip_sum = 0;
                }
                break;
        }

        connp->conn_s2c_atime = ddi_get_lbolt64();
        connp->conn_s2c_pkt_cnt = 1;
        connp->conn_s2c_tcp_fin_sent = B_FALSE;
        connp->conn_s2c_tcp_fin_acked = B_FALSE;

        /* Add it to the s2c hash table. */
        hash = ilbs->ilbs_s2c_conn_hash;
        i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3],
            ntohs(connp->conn_s2c_sport),
            (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3],
            ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size);
        connp->conn_s2c_hash = &hash[i];
        DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i);

        /* Insert at the head of the bucket's chain. */
        mutex_enter(&hash[i].ilb_conn_hash_lock);
        hash[i].ilb_conn_cnt++;
        connp->conn_s2c_next = hash[i].ilb_connp;
        if (hash[i].ilb_connp != NULL)
                hash[i].ilb_connp->conn_s2c_prev = connp;
        connp->conn_s2c_prev = NULL;
        hash[i].ilb_connp = connp;
        mutex_exit(&hash[i].ilb_conn_hash_lock);

        /* Add it to the c2s hash table. */
        hash = ilbs->ilbs_c2s_conn_hash;
        i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
            (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
            ilbs->ilbs_conn_hash_size);
        connp->conn_c2s_hash = &hash[i];
        DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i);

        /* Insert at the head of the bucket's chain. */
        mutex_enter(&hash[i].ilb_conn_hash_lock);
        hash[i].ilb_conn_cnt++;
        connp->conn_c2s_next = hash[i].ilb_connp;
        if (hash[i].ilb_connp != NULL)
                hash[i].ilb_connp->conn_c2s_prev = connp;
        connp->conn_c2s_prev = NULL;
        hash[i].ilb_connp = connp;
        mutex_exit(&hash[i].ilb_conn_hash_lock);

        return (0);
}
 686 
 687 /*
 688  * If a connection is using TCP, we keep track of simple TCP state transition
 689  * so that we know when to clean up an entry.
 690  */
 691 static boolean_t
 692 update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len,
 693     boolean_t c2s)
 694 {
 695         uint32_t ack, seq;
 696         int32_t seg_len;
 697 
 698         if (tcpha->tha_flags & TH_RST)
 699                 return (B_FALSE);
 700 
 701         seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) -
 702             TCP_HDR_LENGTH((tcph_t *)tcpha);
 703 
 704         if (tcpha->tha_flags & TH_ACK)
 705                 ack = ntohl(tcpha->tha_ack);
 706         seq = ntohl(tcpha->tha_seq);
 707         if (c2s) {
 708                 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
 709                 if (tcpha->tha_flags & TH_FIN) {
 710                         connp->conn_c2s_tcp_fss = seq + seg_len;
 711                         connp->conn_c2s_tcp_fin_sent = B_TRUE;
 712                 }
 713                 connp->conn_c2s_tcp_ack = ack;
 714 
 715                 /* Port reuse by the client, restart the conn. */
 716                 if (connp->conn_c2s_tcp_fin_sent &&
 717                     SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) {
 718                         connp->conn_c2s_tcp_fin_sent = B_FALSE;
 719                         connp->conn_c2s_tcp_fin_acked = B_FALSE;
 720                 }
 721         } else {
 722                 ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
 723                 if (tcpha->tha_flags & TH_FIN) {
 724                         connp->conn_s2c_tcp_fss = seq + seg_len;
 725                         connp->conn_s2c_tcp_fin_sent = B_TRUE;
 726                 }
 727                 connp->conn_s2c_tcp_ack = ack;
 728 
 729                 /* Port reuse by the client, restart the conn. */
 730                 if (connp->conn_s2c_tcp_fin_sent &&
 731                     SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) {
 732                         connp->conn_s2c_tcp_fin_sent = B_FALSE;
 733                         connp->conn_s2c_tcp_fin_acked = B_FALSE;
 734                 }
 735         }
 736 
 737         return (B_TRUE);
 738 }
 739 
 740 /*
 741  * Helper routint to find conn hash entry given some packet information and
 742  * the traffic direction (c2s, client to server?)
 743  */
 744 static boolean_t
 745 ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src,
 746     in_port_t sport, in6_addr_t *dst, in_port_t dport,
 747     ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum,
 748     int32_t pkt_len, boolean_t c2s)
 749 {
 750         ilb_conn_hash_t *hash;
 751         uint_t i;
 752         ilb_conn_t *connp;
 753         boolean_t tcp_alive;
 754         boolean_t ret = B_FALSE;
 755 
 756         i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
 757             (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
 758             ilbs->ilbs_conn_hash_size);
 759         if (c2s) {
 760                 hash = ilbs->ilbs_c2s_conn_hash;
 761                 mutex_enter(&hash[i].ilb_conn_hash_lock);
 762                 for (connp = hash[i].ilb_connp; connp != NULL;
 763                     connp = connp->conn_c2s_next) {
 764                         if (connp->conn_l4 == l4 &&
 765                             connp->conn_c2s_dport == dport &&
 766                             connp->conn_c2s_sport == sport &&
 767                             IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) &&
 768                             IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) {
 769                                 connp->conn_c2s_atime = ddi_get_lbolt64();
 770                                 connp->conn_c2s_pkt_cnt++;
 771                                 *rule_cache = connp->conn_rule_cache;
 772                                 *ip_sum = connp->conn_c2s_ip_sum;
 773                                 *tp_sum = connp->conn_c2s_tp_sum;
 774                                 ret = B_TRUE;
 775                                 break;
 776                         }
 777                 }
 778         } else {
 779                 hash = ilbs->ilbs_s2c_conn_hash;
 780                 mutex_enter(&hash[i].ilb_conn_hash_lock);
 781                 for (connp = hash[i].ilb_connp; connp != NULL;
 782                     connp = connp->conn_s2c_next) {
 783                         if (connp->conn_l4 == l4 &&
 784                             connp->conn_s2c_dport == dport &&
 785                             connp->conn_s2c_sport == sport &&
 786                             IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) &&
 787                             IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) {
 788                                 connp->conn_s2c_atime = ddi_get_lbolt64();
 789                                 connp->conn_s2c_pkt_cnt++;
 790                                 *rule_cache = connp->conn_rule_cache;
 791                                 *ip_sum = connp->conn_s2c_ip_sum;
 792                                 *tp_sum = connp->conn_s2c_tp_sum;
 793                                 ret = B_TRUE;
 794                                 break;
 795                         }
 796                 }
 797         }
 798         if (ret) {
 799                 ILB_S_KSTAT(connp->conn_server, pkt_processed);
 800                 ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed,
 801                     pkt_len);
 802 
 803                 switch (l4) {
 804                 case (IPPROTO_TCP):
 805                         tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len,
 806                             c2s);
 807                         if (!tcp_alive) {
 808                                 connp->conn_gc = B_TRUE;
 809                         }
 810                         break;
 811                 default:
 812                         break;
 813                 }
 814         }
 815         mutex_exit(&hash[i].ilb_conn_hash_lock);
 816 
 817         return (ret);
 818 }
 819 
 820 /*
 821  * To check if a give packet matches an existing conn hash entry.  If it
 822  * does, return the information about this entry so that the caller can
 823  * do the proper NAT.
 824  */
 825 boolean_t
 826 ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph,
 827     in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport,
 828     uint32_t pkt_len, in6_addr_t *lb_dst)
 829 {
 830         ilb_rule_info_t rule_cache;
 831         uint32_t adj_ip_sum, adj_tp_sum;
 832         boolean_t ret;
 833 
 834         /* Check the incoming hash table. */
 835         if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
 836             &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) {
 837                 switch (rule_cache.topo) {
 838                 case ILB_TOPO_IMPL_NAT:
 839                         *lb_dst = rule_cache.info.nat_dst;
 840                         ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
 841                             adj_ip_sum, adj_tp_sum, B_TRUE);
 842                         ret = B_TRUE;
 843                         break;
 844                 case ILB_TOPO_IMPL_HALF_NAT:
 845                         *lb_dst = rule_cache.info.nat_dst;
 846                         ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
 847                             adj_ip_sum, adj_tp_sum, B_TRUE);
 848                         ret = B_TRUE;
 849                         break;
 850                 default:
 851                         ret = B_FALSE;
 852                         break;
 853                 }
 854                 return (ret);
 855         }
 856         if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
 857             &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) {
 858                 switch (rule_cache.topo) {
 859                 case ILB_TOPO_IMPL_NAT:
 860                         *lb_dst = rule_cache.info.src;
 861                         ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
 862                             adj_ip_sum, adj_tp_sum, B_FALSE);
 863                         ret = B_TRUE;
 864                         break;
 865                 case ILB_TOPO_IMPL_HALF_NAT:
 866                         *lb_dst = *dst;
 867                         ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
 868                             adj_ip_sum, adj_tp_sum, B_FALSE);
 869                         ret = B_TRUE;
 870                         break;
 871                 default:
 872                         ret = B_FALSE;
 873                         break;
 874                 }
 875                 return (ret);
 876         }
 877 
 878         return (B_FALSE);
 879 }
 880 
 881 /*
 882  * To check if an ICMP packet belongs to a connection in one of the conn
 883  * hash entries.
 884  */
 885 boolean_t
 886 ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph,
 887     void *icmph, in6_addr_t *lb_dst)
 888 {
 889         ilb_conn_hash_t *hash;
 890         ipha_t *in_iph4;
 891         ip6_t *in_iph6;
 892         icmph_t *icmph4;
 893         icmp6_t *icmph6;
 894         in6_addr_t *in_src_p, *in_dst_p;
 895         in_port_t *sport, *dport;
 896         int l4;
 897         uint_t i;
 898         ilb_conn_t *connp;
 899         ilb_rule_info_t rule_cache;
 900         uint32_t adj_ip_sum;
 901         boolean_t full_nat;
 902 
 903         if (l3 == IPPROTO_IP) {
 904                 in6_addr_t in_src, in_dst;
 905 
 906                 icmph4 = (icmph_t *)icmph;
 907                 in_iph4 = (ipha_t *)&icmph4[1];
 908 
 909                 if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) +
 910                     ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
 911                         return (B_FALSE);
 912                 }
 913 
 914                 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src);
 915                 in_src_p = &in_src;
 916                 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst);
 917                 in_dst_p = &in_dst;
 918 
 919                 l4 = in_iph4->ipha_protocol;
 920                 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
 921                         return (B_FALSE);
 922 
 923                 sport = (in_port_t *)((char *)in_iph4 +
 924                     IPH_HDR_LENGTH(in_iph4));
 925                 dport = sport + 1;
 926 
 927                 DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t,
 928                     in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t,
 929                     ntohs(*sport), uint16_t, ntohs(*dport));
 930         } else {
 931                 ASSERT(l3 == IPPROTO_IPV6);
 932 
 933                 icmph6 = (icmp6_t *)icmph;
 934                 in_iph6 = (ip6_t *)&icmph6[1];
 935                 in_src_p = &in_iph6->ip6_src;
 936                 in_dst_p = &in_iph6->ip6_dst;
 937 
 938                 if ((uint8_t *)in_iph6 + sizeof (ip6_t) +
 939                     ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
 940                         return (B_FALSE);
 941                 }
 942 
 943                 l4 = in_iph6->ip6_nxt;
 944                 /* We don't go deep inside an IPv6 packet yet. */
 945                 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
 946                         return (B_FALSE);
 947 
 948                 sport = (in_port_t *)&in_iph6[1];
 949                 dport = sport + 1;
 950 
 951                 DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *,
 952                     &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst,
 953                     uint16_t, ntohs(*sport), uint16_t, ntohs(*dport));
 954         }
 955 
 956         i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport),
 957             (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport),
 958             ilbs->ilbs_conn_hash_size);
 959         hash = ilbs->ilbs_c2s_conn_hash;
 960 
 961         mutex_enter(&hash[i].ilb_conn_hash_lock);
 962         for (connp = hash[i].ilb_connp; connp != NULL;
 963             connp = connp->conn_c2s_next) {
 964                 if (connp->conn_l4 == l4 &&
 965                     connp->conn_c2s_dport == *sport &&
 966                     connp->conn_c2s_sport == *dport &&
 967                     IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) &&
 968                     IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) {
 969                         connp->conn_c2s_atime = ddi_get_lbolt64();
 970                         connp->conn_c2s_pkt_cnt++;
 971                         rule_cache = connp->conn_rule_cache;
 972                         adj_ip_sum = connp->conn_c2s_ip_sum;
 973                         break;
 974                 }
 975         }
 976         mutex_exit(&hash[i].ilb_conn_hash_lock);
 977 
 978         if (connp == NULL) {
 979                 DTRACE_PROBE(ilb__chk__icmp__conn__failed);
 980                 return (B_FALSE);
 981         }
 982 
 983         switch (rule_cache.topo) {
 984         case ILB_TOPO_IMPL_NAT:
 985                 full_nat = B_TRUE;
 986                 break;
 987         case ILB_TOPO_IMPL_HALF_NAT:
 988                 full_nat = B_FALSE;
 989                 break;
 990         default:
 991                 return (B_FALSE);
 992         }
 993 
 994         *lb_dst = rule_cache.info.nat_dst;
 995         if (l3 == IPPROTO_IP) {
 996                 ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport,
 997                     &rule_cache.info, adj_ip_sum, full_nat);
 998         } else {
 999                 ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport,
1000                     &rule_cache.info, full_nat);
1001         }
1002         return (B_TRUE);
1003 }
1004 
1005 /*
1006  * This routine sends up the conn hash table to user land.  Note that the
1007  * request is an ioctl, hence we cannot really differentiate requests
1008  * from different clients.  There is no context shared between different
1009  * ioctls.  Here we make the assumption that the user land ilbd will
1010  * only allow one client to show the conn hash table at any time.
1011  * Otherwise, the results will be "very" inconsistent.
1012  *
1013  * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants
1014  * to read from the beginning of the able.  After a certain entries
1015  * are reported, the kernel remembers the position of the last returned
1016  * entry.  When the next ioctl comes in with the ILB_LIST_BEGIN flag,
1017  * it will return entries starting from where it was left off.  When
1018  * the end of table is reached, a flag (ILB_LIST_END) is set to tell
1019  * the client that there is no more entry.
1020  *
1021  * It is assumed that the caller has checked the size of nat so that it
1022  * can hold num entries.
1023  */
1024 /* ARGSUSED */
1025 int
1026 ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat,
1027     uint32_t *num, uint32_t *flags)
1028 {
1029         ilb_conn_hash_t *hash;
1030         ilb_conn_t *cur_connp;
1031         uint32_t i, j;
1032         int ret = 0;
1033 
1034         mutex_enter(&ilbs->ilbs_conn_list_lock);
1035         while (ilbs->ilbs_conn_list_busy) {
1036                 if (cv_wait_sig(&ilbs->ilbs_conn_list_cv,
1037                     &ilbs->ilbs_conn_list_lock) == 0) {
1038                         mutex_exit(&ilbs->ilbs_conn_list_lock);
1039                         return (EINTR);
1040                 }
1041         }
1042         if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) {
1043                 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
1044                 mutex_exit(&ilbs->ilbs_conn_list_lock);
1045                 *num = 0;
1046                 *flags |= ILB_LIST_END;
1047                 return (0);
1048         }
1049         ilbs->ilbs_conn_list_busy = B_TRUE;
1050         mutex_exit(&ilbs->ilbs_conn_list_lock);
1051 
1052         if (*flags & ILB_LIST_BEGIN) {
1053                 i = 0;
1054                 mutex_enter(&hash[0].ilb_conn_hash_lock);
1055                 cur_connp = hash[0].ilb_connp;
1056         } else if (*flags & ILB_LIST_CONT) {
1057                 if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) {
1058                         *num = 0;
1059                         *flags |= ILB_LIST_END;
1060                         goto done;
1061                 }
1062                 i = ilbs->ilbs_conn_list_cur;
1063                 mutex_enter(&hash[i].ilb_conn_hash_lock);
1064                 cur_connp = ilbs->ilbs_conn_list_connp;
1065         } else {
1066                 ret = EINVAL;
1067                 goto done;
1068         }
1069 
1070         j = 0;
1071         while (j < *num) {
1072                 if (cur_connp == NULL) {
1073                         mutex_exit(&hash[i].ilb_conn_hash_lock);
1074                         if (++i == ilbs->ilbs_conn_hash_size) {
1075                                 *flags |= ILB_LIST_END;
1076                                 break;
1077                         }
1078                         mutex_enter(&hash[i].ilb_conn_hash_lock);
1079                         cur_connp = hash[i].ilb_connp;
1080                         continue;
1081                 }
1082                 nat[j].proto = cur_connp->conn_l4;
1083 
1084                 nat[j].in_global = cur_connp->conn_c2s_daddr;
1085                 nat[j].in_global_port = cur_connp->conn_c2s_dport;
1086                 nat[j].out_global = cur_connp->conn_c2s_saddr;
1087                 nat[j].out_global_port = cur_connp->conn_c2s_sport;
1088 
1089                 nat[j].in_local = cur_connp->conn_s2c_saddr;
1090                 nat[j].in_local_port = cur_connp->conn_s2c_sport;
1091                 nat[j].out_local = cur_connp->conn_s2c_daddr;
1092                 nat[j].out_local_port = cur_connp->conn_s2c_dport;
1093 
1094                 nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time);
1095                 nat[j].last_access_time =
1096                     TICK_TO_MSEC(cur_connp->conn_c2s_atime);
1097 
1098                 /*
1099                  * The conn_s2c_pkt_cnt may not be accurate since we are not
1100                  * holding the s2c hash lock.
1101                  */
1102                 nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt +
1103                     cur_connp->conn_s2c_pkt_cnt;
1104                 j++;
1105 
1106                 cur_connp = cur_connp->conn_c2s_next;
1107         }
1108         ilbs->ilbs_conn_list_connp = cur_connp;
1109         if (j == *num)
1110                 mutex_exit(&hash[i].ilb_conn_hash_lock);
1111 
1112         ilbs->ilbs_conn_list_cur = i;
1113 
1114         *num = j;
1115 done:
1116         mutex_enter(&ilbs->ilbs_conn_list_lock);
1117         ilbs->ilbs_conn_list_busy = B_FALSE;
1118         cv_signal(&ilbs->ilbs_conn_list_cv);
1119         mutex_exit(&ilbs->ilbs_conn_list_lock);
1120 
1121         return (ret);
1122 }
1123 
1124 
1125 /*
1126  * Stickiness (persistence) handling routines.
1127  */
1128 
1129 
1130 static void
1131 ilb_sticky_cache_init(void)
1132 {
1133         ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache",
1134             sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL,
1135             ilb_kmem_flags);
1136 }
1137 
1138 void
1139 ilb_sticky_cache_fini(void)
1140 {
1141         if (ilb_sticky_cache != NULL) {
1142                 kmem_cache_destroy(ilb_sticky_cache);
1143                 ilb_sticky_cache = NULL;
1144         }
1145 }
1146 
1147 void
1148 ilb_sticky_refrele(ilb_sticky_t *s)
1149 {
1150         ILB_STICKY_REFRELE(s);
1151 }
1152 
1153 static ilb_sticky_t *
1154 ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src)
1155 {
1156         ilb_sticky_t *s;
1157 
1158         ASSERT(mutex_owned(&hash->sticky_lock));
1159 
1160         for (s = list_head(&hash->sticky_head); s != NULL;
1161             s = list_next(&hash->sticky_head, s)) {
1162                 if (s->rule_instance == rule->ir_ks_instance) {
1163                         if (IN6_ARE_ADDR_EQUAL(src, &s->src))
1164                                 return (s);
1165                 }
1166         }
1167         return (NULL);
1168 }
1169 
1170 static ilb_sticky_t *
1171 ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server,
1172     in6_addr_t *src)
1173 {
1174         ilb_sticky_t *s;
1175 
1176         ASSERT(mutex_owned(&hash->sticky_lock));
1177 
1178         if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL)
1179                 return (NULL);
1180 
1181         /*
1182          * The rule instance is for handling the scenario when the same
1183          * client talks to different rules at the same time.  Stickiness
1184          * is per rule so we can use the rule instance to differentiate
1185          * the client's request.
1186          */
1187         s->rule_instance = rule->ir_ks_instance;
1188         /*
1189          * Copy the rule name for listing all sticky cache entry.  ir_name
1190          * is guaranteed to be NULL terminated.
1191          */
1192         (void) strcpy(s->rule_name, rule->ir_name);
1193         s->server = server;
1194 
1195         /*
1196          * Grab a ref cnt on the server so that it won't go away while
1197          * it is still in the sticky table.
1198          */
1199         ILB_SERVER_REFHOLD(server);
1200         s->src = *src;
1201         s->expiry = rule->ir_sticky_expiry;
1202         s->refcnt = 1;
1203         s->hash = hash;
1204 
1205         /*
1206          * There is no need to set atime here since the refcnt is not
1207          * zero.  A sticky entry is removed only when the refcnt is
1208          * zero.  But just set it here for debugging purpose.  The
1209          * atime is set when a refrele is done on a sticky entry.
1210          */
1211         s->atime = ddi_get_lbolt64();
1212 
1213         list_insert_head(&hash->sticky_head, s);
1214         hash->sticky_cnt++;
1215         return (s);
1216 }
1217 
1218 /*
1219  * This routine checks if there is an existing sticky entry which matches
1220  * a given packet.  If there is one, return it.  If there is not, create
1221  * a sticky entry using the packet's info.
1222  */
1223 ilb_server_t *
1224 ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src,
1225     ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx)
1226 {
1227         int i;
1228         ilb_sticky_hash_t *hash;
1229         ilb_sticky_t *s;
1230 
1231         ASSERT(server != NULL);
1232 
1233         *res = NULL;
1234 
1235         i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3],
1236             (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size);
1237         hash = &ilbs->ilbs_sticky_hash[i];
1238 
1239         /* First check if there is already an entry. */
1240         mutex_enter(&hash->sticky_lock);
1241         s = ilb_sticky_lookup(hash, rule, src);
1242 
1243         /* No sticky entry, add one. */
1244         if (s == NULL) {
1245 add_new_entry:
1246                 s = ilb_sticky_add(hash, rule, server, src);
1247                 if (s == NULL) {
1248                         mutex_exit(&hash->sticky_lock);
1249                         return (NULL);
1250                 }
1251                 /*
1252                  * Find a source for this server.  All subseqent requests from
1253                  * the same client matching this sticky entry will use this
1254                  * source address in doing NAT.  The current algorithm is
1255                  * simple, rotate the source address.  Note that the
1256                  * source address array does not change after it's created, so
1257                  * it is OK to just increment the cur index.
1258                  */
1259                 if (server->iser_nat_src != NULL) {
1260                         /* It is a hint, does not need to be atomic. */
1261                         *src_ent_idx = (server->iser_nat_src->cur++ %
1262                             server->iser_nat_src->num_src);
1263                         s->nat_src_idx = *src_ent_idx;
1264                 }
1265                 mutex_exit(&hash->sticky_lock);
1266                 *res = s;
1267                 return (server);
1268         }
1269 
1270         /*
1271          * We don't hold any lock accessing iser_enabled.  Refer to the
1272          * comment in ilb_server_add() about iser_lock.
1273          */
1274         if (!s->server->iser_enabled) {
1275                 /*
1276                  * s->server == server can only happen if there is a race in
1277                  * toggling the iser_enabled flag (we don't hold a lock doing
1278                  * that) so that the load balance algorithm still returns a
1279                  * disabled server.  In this case, just drop the packet...
1280                  */
1281                 if (s->server == server) {
1282                         mutex_exit(&hash->sticky_lock);
1283                         return (NULL);
1284                 }
1285 
1286                 /*
1287                  * The old server is disabled and there is a new server, use
1288                  * the new one to create a sticky entry.  Since we will
1289                  * add the entry at the beginning, subsequent lookup will
1290                  * find this new entry instead of the old one.
1291                  */
1292                 goto add_new_entry;
1293         }
1294 
1295         s->refcnt++;
1296         *res = s;
1297         mutex_exit(&hash->sticky_lock);
1298         if (server->iser_nat_src != NULL)
1299                 *src_ent_idx = s->nat_src_idx;
1300         return (s->server);
1301 }
1302 
1303 static void
1304 ilb_sticky_cleanup(void *arg)
1305 {
1306         ilb_timer_t *timer = (ilb_timer_t *)arg;
1307         uint32_t i;
1308         ilb_stack_t *ilbs;
1309         ilb_sticky_hash_t *hash;
1310         ilb_sticky_t *s, *nxt_s;
1311         int64_t now, expiry;
1312 
1313         ilbs = timer->ilbs;
1314         hash = ilbs->ilbs_sticky_hash;
1315         ASSERT(hash != NULL);
1316 
1317         now = ddi_get_lbolt64();
1318         for (i = timer->start; i < timer->end; i++) {
1319                 mutex_enter(&hash[i].sticky_lock);
1320                 for (s = list_head(&hash[i].sticky_head); s != NULL;
1321                     s = nxt_s) {
1322                         nxt_s = list_next(&hash[i].sticky_head, s);
1323                         if (s->refcnt != 0)
1324                                 continue;
1325                         expiry = now - SEC_TO_TICK(s->expiry);
1326                         if (s->atime < expiry) {
1327                                 ILB_SERVER_REFRELE(s->server);
1328                                 list_remove(&hash[i].sticky_head, s);
1329                                 kmem_cache_free(ilb_sticky_cache, s);
1330                                 hash[i].sticky_cnt--;
1331                         }
1332                 }
1333                 mutex_exit(&hash[i].sticky_lock);
1334         }
1335 }
1336 
1337 static void
1338 ilb_sticky_timer(void *arg)
1339 {
1340         ilb_timer_t *timer = (ilb_timer_t *)arg;
1341 
1342         (void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq,
1343             ilb_sticky_cleanup, arg, TQ_SLEEP);
1344         mutex_enter(&timer->tid_lock);
1345         if (timer->tid == 0) {
1346                 mutex_exit(&timer->tid_lock);
1347         } else {
1348                 timer->tid = timeout(ilb_sticky_timer, arg,
1349                     SEC_TO_TICK(ilb_sticky_timeout));
1350                 mutex_exit(&timer->tid_lock);
1351         }
1352 }
1353 
1354 void
1355 ilb_sticky_hash_init(ilb_stack_t *ilbs)
1356 {
1357         extern pri_t minclsyspri;
1358         int i, part;
1359         char tq_name[TASKQ_NAMELEN];
1360         ilb_timer_t *tm;
1361 
1362         if (ilbs->ilbs_sticky_hash_size & (ilbs->ilbs_sticky_hash_size - 1)) {
1363                 for (i = 0; i < 31; i++) {
1364                         if (ilbs->ilbs_sticky_hash_size < (1 << i))
1365                                 break;
1366                 }
1367                 ilbs->ilbs_sticky_hash_size = 1 << i;
1368         }
1369 
1370         ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) *
1371             ilbs->ilbs_sticky_hash_size, KM_SLEEP);
1372         for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1373                 mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL,
1374                     MUTEX_DEFAULT, NULL);
1375                 list_create(&ilbs->ilbs_sticky_hash[i].sticky_head,
1376                     sizeof (ilb_sticky_t),
1377                     offsetof(ilb_sticky_t, list));
1378         }
1379 
1380         if (ilb_sticky_cache == NULL)
1381                 ilb_sticky_cache_init();
1382 
1383         (void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p",
1384             (void *)ilbs->ilbs_netstack);
1385         ASSERT(ilbs->ilbs_sticky_taskq == NULL);
1386         ilbs->ilbs_sticky_taskq = taskq_create(tq_name,
1387             ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size,
1388             ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
1389 
1390         ASSERT(ilbs->ilbs_sticky_timer_list == NULL);
1391         ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
1392             ilb_sticky_timer_size, KM_SLEEP);
1393         part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1;
1394         for (i = 0; i < ilb_sticky_timer_size; i++) {
1395                 tm = ilbs->ilbs_sticky_timer_list + i;
1396                 tm->start = i * part;
1397                 tm->end = i * part + part;
1398                 if (tm->end > ilbs->ilbs_sticky_hash_size)
1399                         tm->end = ilbs->ilbs_sticky_hash_size;
1400                 tm->ilbs = ilbs;
1401                 mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
1402                 /* Spread out the starting execution time of all the timers. */
1403                 tm->tid = timeout(ilb_sticky_timer, tm,
1404                     SEC_TO_TICK(ilb_sticky_timeout + i));
1405         }
1406 }
1407 
1408 void
1409 ilb_sticky_hash_fini(ilb_stack_t *ilbs)
1410 {
1411         int i;
1412         ilb_sticky_t *s;
1413 
1414         if (ilbs->ilbs_sticky_hash == NULL)
1415                 return;
1416 
1417         /* Stop all the timers first. */
1418         for (i = 0; i < ilb_sticky_timer_size; i++) {
1419                 timeout_id_t tid;
1420 
1421                 /* Setting tid to 0 tells the timer handler not to restart. */
1422                 mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1423                 tid = ilbs->ilbs_sticky_timer_list[i].tid;
1424                 ilbs->ilbs_sticky_timer_list[i].tid = 0;
1425                 mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1426                 (void) untimeout(tid);
1427         }
1428         kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) *
1429             ilb_sticky_timer_size);
1430         taskq_destroy(ilbs->ilbs_sticky_taskq);
1431         ilbs->ilbs_sticky_taskq = NULL;
1432 
1433         for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1434                 while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head))
1435                     != NULL) {
1436                         list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s);
1437                         ILB_SERVER_REFRELE(s->server);
1438                         kmem_free(s, sizeof (ilb_sticky_t));
1439                 }
1440         }
1441         kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size *
1442             sizeof (ilb_sticky_hash_t));
1443 }
1444 
1445 /*
1446  * This routine sends up the sticky hash table to user land.  Refer to
1447  * the comments before ilb_list_nat().  Both routines assume similar
1448  * conditions.
1449  *
1450  * It is assumed that the caller has checked the size of st so that it
1451  * can hold num entries.
1452  */
1453 /* ARGSUSED */
1454 int
1455 ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st,
1456     uint32_t *num, uint32_t *flags)
1457 {
1458         ilb_sticky_hash_t *hash;
1459         ilb_sticky_t *curp;
1460         uint32_t i, j;
1461         int ret = 0;
1462 
1463         mutex_enter(&ilbs->ilbs_sticky_list_lock);
1464         while (ilbs->ilbs_sticky_list_busy) {
1465                 if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv,
1466                     &ilbs->ilbs_sticky_list_lock) == 0) {
1467                         mutex_exit(&ilbs->ilbs_sticky_list_lock);
1468                         return (EINTR);
1469                 }
1470         }
1471         if ((hash = ilbs->ilbs_sticky_hash) == NULL) {
1472                 mutex_exit(&ilbs->ilbs_sticky_list_lock);
1473                 *num = 0;
1474                 *flags |= ILB_LIST_END;
1475                 return (0);
1476         }
1477         ilbs->ilbs_sticky_list_busy = B_TRUE;
1478         mutex_exit(&ilbs->ilbs_sticky_list_lock);
1479 
1480         if (*flags & ILB_LIST_BEGIN) {
1481                 i = 0;
1482                 mutex_enter(&hash[0].sticky_lock);
1483                 curp = list_head(&hash[0].sticky_head);
1484         } else if (*flags & ILB_LIST_CONT) {
1485                 if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) {
1486                         *num = 0;
1487                         *flags |= ILB_LIST_END;
1488                         goto done;
1489                 }
1490                 i = ilbs->ilbs_sticky_list_cur;
1491                 mutex_enter(&hash[i].sticky_lock);
1492                 curp = ilbs->ilbs_sticky_list_curp;
1493         } else {
1494                 ret = EINVAL;
1495                 goto done;
1496         }
1497 
1498         j = 0;
1499         while (j < *num) {
1500                 if (curp == NULL) {
1501                         mutex_exit(&hash[i].sticky_lock);
1502                         if (++i == ilbs->ilbs_sticky_hash_size) {
1503                                 *flags |= ILB_LIST_END;
1504                                 break;
1505                         }
1506                         mutex_enter(&hash[i].sticky_lock);
1507                         curp = list_head(&hash[i].sticky_head);
1508                         continue;
1509                 }
1510                 (void) strcpy(st[j].rule_name, curp->rule_name);
1511                 st[j].req_addr = curp->src;
1512                 st[j].srv_addr = curp->server->iser_addr_v6;
1513                 st[j].expiry_time = TICK_TO_MSEC(curp->expiry);
1514                 j++;
1515                 curp = list_next(&hash[i].sticky_head, curp);
1516         }
1517         ilbs->ilbs_sticky_list_curp = curp;
1518         if (j == *num)
1519                 mutex_exit(&hash[i].sticky_lock);
1520 
1521         ilbs->ilbs_sticky_list_cur = i;
1522 
1523         *num = j;
1524 done:
1525         mutex_enter(&ilbs->ilbs_sticky_list_lock);
1526         ilbs->ilbs_sticky_list_busy = B_FALSE;
1527         cv_signal(&ilbs->ilbs_sticky_list_cv);
1528         mutex_exit(&ilbs->ilbs_sticky_list_lock);
1529 
1530         return (ret);
1531 }