Attachment 'checksumcompare.patch'
--- sys/i386/i386/in_cksum.c.back	2007-07-17 18:32:52.000000000 +0000
+++ sys/i386/i386/in_cksum.c	2007-07-22 00:36:56.000000000 +0000
@@ -43,7 +43,7 @@
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
-
+#include <netinet/ip_var.h>
 #include <machine/in_cksum.h>
 
 /*
@@ -259,7 +259,7 @@
  * load the value into a register but will not use it.  Since modern CPUs
  * reorder operations, this will generally take place in parallel with
  * other calculations.
- */
+ *
 #define ADD(n)        __asm __volatile \
                ("addl %1, %0" : "+r" (sum) : \
                "g" (((const u_int32_t *)w)[n / 4]))
@@ -270,7 +270,11 @@
                ("" : : "r" (((const u_int32_t *)w)[n / 4]))
 #define MOP           __asm __volatile \
                ("adcl $0, %0" : "+r" (sum))
-
+ * These macros are commented out because gcc may insert instructions
+ * between the separate __asm statements and clobber the carry flag.
+ */
+u_int64_t
+in_cksum_range(struct mbuf *m, int nxt, int offset, int bytes);
 u_short
 in_cksum_skip(m, len, skip)
        struct mbuf *m;
@@ -279,10 +283,14 @@
 {
        register u_short *w;
        register unsigned sum = 0;
+       u_int64_t rv, rv1, rv2;
        register int mlen = 0;
        int byte_swapped = 0;
        union { char c[2]; u_short s; } su;
 
+       rv2 = in_cksum_range(m, 0, skip, len - skip);
+
+       __asm __volatile("rdtsc" : "=A" (rv));
        len -= skip;
        for (; skip && m; m = m->m_next) {
                if (m->m_len > skip) {
@@ -341,15 +349,25 @@
                 * Advance to a 486 cache line boundary.
                 */
                if (4 & (int) w && mlen >= 4) {
-                       ADD(0);
-                       MOP;
+                       /* ADD(0);
+                       MOP; */
+                       __asm volatile ("addl %1, %0 \n"
+                               "adcl $0, %0"
+                               : "+r" (sum)
+                               : "g" (((const unsigned int *)w)[0 / 4]));
                        w += 2;
                        mlen -= 4;
                }
                if (8 & (int) w && mlen >= 8) {
-                       ADD(0);
+                       /* ADD(0);
                        ADDC(4);
-                       MOP;
+                       MOP; */
+                       __asm volatile ("addl %1, %0 \n"
+                               "adcl %2, %0 \n"
+                               "adcl $0, %0"
+                               : "+r" (sum) :
+                               "g" (((const unsigned int *)w)[0 / 4]),
+                               "g" (((const unsigned int *)w)[4 / 4]));
                        w += 4;
                        mlen -= 8;
                }
@@ -379,7 +397,7 @@
                         * is initially 33 (not 32) to guaranteed that
                         * the LOAD(32) is within bounds.
                         */
-                       ADD(16);
+                       /* ADD(16);
                        ADDC(0);
                        ADDC(4);
                        ADDC(8);
@@ -388,12 +406,30 @@
                        ADDC(20);
                        ADDC(24);
                        ADDC(28);
-                       MOP;
+                       MOP; */
+                       __asm volatile ("addl %1, %0 \n"
+                               "adcl %2, %0 \n"
+                               "adcl %3, %0 \n"
+                               "adcl %4, %0 \n"
+                               "adcl %5, %0 \n"
+                               "adcl %7, %0 \n"
+                               "adcl %8, %0 \n"
+                               "adcl %9, %0 \n"
+                               "adcl $0, %0" : "+r" (sum) :
+                               "g" (((const unsigned int *)w)[16 / 4]),
+                               "g" (((const unsigned int *)w)[0 / 4]),
+                               "g" (((const unsigned int *)w)[4 / 4]),
+                               "g" (((const unsigned int *)w)[8 / 4]),
+                               "g" (((const unsigned int *)w)[12 / 4]),
+                               "r" (((const unsigned int *)w)[32 / 4]),
+                               "g" (((const unsigned int *)w)[20 / 4]),
+                               "g" (((const unsigned int *)w)[24 / 4]),
+                               "g" (((const unsigned int *)w)[28 / 4]));
                        w += 16;
                }
                mlen += 32 + 1;
                if (mlen >= 32) {
-                       ADD(16);
+                       /* ADD(16);
                        ADDC(0);
                        ADDC(4);
                        ADDC(8);
@@ -401,23 +437,54 @@
                        ADDC(20);
                        ADDC(24);
                        ADDC(28);
-                       MOP;
+                       MOP; */
+                       __asm volatile ("addl %1, %0 \n"
+                               "adcl %2, %0 \n"
+                               "adcl %3, %0 \n"
+                               "adcl %4, %0 \n"
+                               "adcl %5, %0 \n"
+                               "adcl %6, %0 \n"
+                               "adcl %7, %0 \n"
+                               "adcl %8, %0 \n"
+                               "adcl $0, %0" : "+r" (sum) :
+                               "g" (((const unsigned int *)w)[16 / 4]),
+                               "g" (((const unsigned int *)w)[0 / 4]),
+                               "g" (((const unsigned int *)w)[4 / 4]),
+                               "g" (((const unsigned int *)w)[8 / 4]),
+                               "g" (((const unsigned int *)w)[12 / 4]),
+                               "g" (((const unsigned int *)w)[20 / 4]),
+                               "g" (((const unsigned int *)w)[24 / 4]),
+                               "g" (((const unsigned int *)w)[28 / 4]));
                        w += 16;
                        mlen -= 32;
                }
                if (mlen >= 16) {
-                       ADD(0);
+                       /* ADD(0);
                        ADDC(4);
                        ADDC(8);
                        ADDC(12);
-                       MOP;
+                       MOP; */
+                       __asm volatile ("addl %1, %0 \n"
+                               "adcl %2, %0 \n"
+                               "adcl %3, %0 \n"
+                               "adcl %4, %0 \n"
+                               "adcl $0, %0" : "+r" (sum) :
+                               "g" (((const unsigned int *)w)[0 / 4]),
+                               "g" (((const unsigned int *)w)[4 / 4]),
+                               "g" (((const unsigned int *)w)[8 / 4]),
+                               "g" (((const unsigned int *)w)[12 / 4]));
                        w += 8;
                        mlen -= 16;
                }
                if (mlen >= 8) {
-                       ADD(0);
+                       /* ADD(0);
                        ADDC(4);
-                       MOP;
+                       MOP; */
+                       __asm volatile ("addl %1, %0 \n"
+                               "adcl %2, %0 \n"
+                               "adcl $0, %0" : "+r" (sum) :
+                               "g" (((const unsigned int *)w)[0 / 4]),
+                               "g" (((const unsigned int *)w)[4 / 4]));
                        w += 4;
                        mlen -= 8;
                }
@@ -455,6 +522,172 @@
                sum += su.s;
        }
        REDUCE;
+       __asm __volatile("rdtsc" : "=A" (rv1));
+       printf("DFBSD took %16llx cycles while FreeBSD took %16llx cycles\n", rv2, rv1 - rv);
        return (~sum & 0xffff);
 }
+__uint32_t asm_ones32(const void *buf, int count);    /* in 32 bit words */
+u_int64_t
+in_cksum_range(struct mbuf *m, int nxt, int offset, int bytes)
+{
+       __uint8_t *ptr;
+       __uint32_t sum0;
+       __uint32_t sum1;
+       int n;
+       int flip;
+       u_int64_t rv, rv1;
+
+       __asm __volatile("rdtsc" : "=A" (rv));
+       sum0 = 0;
+       sum1 = 0;
+       flip = 0;
+
+       if (nxt != 0) {
+               uint32_t sum32;
+               struct ipovly ipov;
+
+               /* pseudo header */
+               if (offset < sizeof(struct ipovly))
+                       panic("in_cksum_range: offset too short");
+               if (m->m_len < sizeof(struct ip))
+                       panic("in_cksum_range: bad mbuf chain");
+               bzero(&ipov, sizeof ipov);
+               ipov.ih_len = htons(bytes);
+               ipov.ih_pr = nxt;
+               ipov.ih_src = mtod(m, struct ip *)->ip_src;
+               ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
+               ptr = (uint8_t *)&ipov;
+
+               sum32 = asm_ones32(ptr, sizeof(ipov) / 4);
+               sum32 = (sum32 >> 16) + (sum32 & 0xffff);
+               if (flip)
+                       sum1 += sum32;
+               else
+                       sum0 += sum32;
+       }
+
+       /*
+        * Skip fully engulfed mbufs.  Branch predict optimal.
+        */
+       while (m && offset >= m->m_len) {
+               offset -= m->m_len;
+               m = m->m_next;
+       }
+
+       /*
+        * Process the checksum for each segment.  Note that the code below is
+        * branch-predict optimal, so it's faster than you might otherwise
+        * believe.  When we are buffer-aligned but also odd-byte-aligned from
+        * the point of view of the IP packet, we accumulate to sum1 instead of
+        * sum0.
+        *
+        * Initial offsets do not pre-set flip (assert that offset is even?)
+        */
+       while (bytes > 0 && m) {
+               /*
+                * Calculate pointer base and number of bytes to snarf, account
+                * for snarfed bytes.
+                */
+               ptr = mtod(m, __uint8_t *) + offset;
+               if ((n = m->m_len - offset) > bytes)
+                       n = bytes;
+               bytes -= n;
+
+               /*
+                * First 16-bit-align our buffer by eating a byte if necessary,
+                * then 32-bit-align our buffer by eating a word if necessary.
+                *
+                * We are endian-sensitive when chomping a byte.  WARNING!  Be
+                * careful optimizing this!  16 and 32 bit words must be aligned
+                * for this to be generic code.
+                */
+               if (((intptr_t)ptr & 1) && n) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+                       if (flip)
+                               sum1 += ptr[0];
+                       else
+                               sum0 += ptr[0];
+#else
+                       if (flip)
+                               sum0 += ptr[0];
+                       else
+                               sum1 += ptr[0];
+#endif
+                       ++ptr;
+                       --n;
+                       flip = 1 - flip;
+               }
+               if (((intptr_t)ptr & 2) && n > 1) {
+                       if (flip)
+                               sum1 += *(__uint16_t *)ptr;
+                       else
+                               sum0 += *(__uint16_t *)ptr;
+                       ptr += 2;
+                       n -= 2;
+               }
+
+               /*
+                * Process a 32-bit aligned data buffer and accumulate the result
+                * in sum0 or sum1.  Allow only one 16 bit overflow carry.
+                */
+               if (n >= 4) {
+                       __uint32_t sum32;
+
+                       sum32 = asm_ones32((void *)ptr, n >> 2);
+                       sum32 = (sum32 >> 16) + (sum32 & 0xffff);
+                       if (flip)
+                               sum1 += sum32;
+                       else
+                               sum0 += sum32;
+                       ptr += n & ~3;
+                       /* n &= 3; dontcare */
+               }
+
+               /*
+                * Handle oddly-sized buffers.  Handle word issues first while
+                * ptr is still aligned.
+                */
+               if (n & 2) {
+                       if (flip)
+                               sum1 += *(__uint16_t *)ptr;
+                       else
+                               sum0 += *(__uint16_t *)ptr;
+                       ptr += 2;
+                       /* n -= 2; dontcare */
+               }
+               if (n & 1) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+                       if (flip)
+                               sum1 += ptr[0];
+                       else
+                               sum0 += ptr[0];
+#else
+                       if (flip)
+                               sum0 += ptr[0];
+                       else
+                               sum1 += ptr[0];
+#endif
+                       /* ++ptr; dontcare */
+                       /* --n; dontcare */
+                       flip = 1 - flip;
+               }
+               m = m->m_next;
+               offset = 0;
+       }
+
+       /*
+        * Due to byte aligned or oddly-sized buffers we may have a checksum
+        * in sum1 which needs to be shifted and added to our main sum.  There
+        * is a presumption here that no more than 255 overflows occurred, which
+        * is 255/3 byte-aligned mbufs in the worst case.
+        */
+       sum0 += sum1 << 8;
+       sum0 = (sum0 >> 16) + (sum0 & 0xffff);
+       if (sum0 > 0xffff)
+               ++sum0;
+       __asm __volatile("rdtsc" : "=A" (rv1));
+       return rv1 - rv;
+//     printf("DFBSD with %16llx cycles,", rv1 - rv);
+//     return (~sum0 & 0xffff);
+}
 #endif
--- sys/i386/i386/support.s.back	2007-07-22 00:46:55.000000000 +0000
+++ sys/i386/i386/support.s	2007-07-22 00:47:07.000000000 +0000
@@ -1532,3 +1532,50 @@
        movl    %edx,16(%eax)
        movl    %eax,bbhead
        NON_GPROF_RET
+
+       .text
+       /*
+        * asm_ones32(32bitalignedbuffer, numberof32bitwords)
+        *
+        * Returns the 32 bit one's complement partial checksum.  This is
+        * basically a 1's complement checksum without the inversion (~)
+        * at the end.  A 32 bit value is returned.  If the caller is
+        * calculating a 16 bit 1's complement checksum the caller must
+        * collapse the 32 bit return value via:
+        *
+        *      result = (result >> 16) + (result & 0xFFFF)
+        *      if (result > 0xFFFF)
+        *              result -= 0xFFFF;  <<< same as (result + 1) & 0xFFFF
+        *                  within the range of result.
+        * Note that worst case 0xFFFFFFFF + 0xFFFFFFFF = 0xFFFFFFFE + CARRY,
+        * so no double-carry ever occurs.
+        */
+       .p2align 4
+ENTRY(asm_ones32)
+       movl    4(%esp),%edx    /* %edx = buffer pointer */
+       movl    8(%esp),%ecx    /* %ecx = counter */
+       subl    %eax,%eax       /* %eax = checksum */
+       cmpl    $5,%ecx
+       jl      2f
+1:
+       subl    $5,%ecx
+       addl    (%edx),%eax
+       adcl    4(%edx),%eax
+       adcl    8(%edx),%eax
+       adcl    12(%edx),%eax
+       adcl    16(%edx),%eax
+       adcl    $0,%eax
+       addl    $20,%edx
+       cmpl    $5,%ecx
+       jge     1b
+2:
+       testl   %ecx,%ecx
+       je      4f
+3:
+       addl    (%edx),%eax
+       adcl    $0,%eax
+       addl    $4,%edx
+       decl    %ecx
+       jnz     3b
+4:
+       ret
 
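
For reference, the collapse described in the asm_ones32() comment can be written out in C. The sketch below is illustrative only and is not part of the patch; fold_to_cksum() and its arguments are hypothetical names, and only the folding arithmetic mirrors the comment above.

__uint32_t asm_ones32(const void *buf, int count);     /* count of 32 bit words */

/* Fold a 32 bit partial sum from asm_ones32() into a final 16 bit checksum. */
static __uint16_t
fold_to_cksum(const void *buf, int nwords)
{
        __uint32_t result;

        result = asm_ones32(buf, nwords);               /* 32 bit partial one's complement sum */
        result = (result >> 16) + (result & 0xFFFF);    /* fold the upper half back in */
        if (result > 0xFFFF)                            /* absorb the final carry; same as */
                result -= 0xFFFF;                       /* (result + 1) & 0xFFFF in this range */
        return (~result & 0xFFFF);                      /* invert for the on-wire checksum */
}

in_cksum_range() in the patch defers this fold until the very end: it accumulates 32 bit partial sums into sum0/sum1 across mbufs and only collapses to 16 bits once, which is what the "Allow only one 16 bit overflow carry" comments refer to.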