Attachment 'checksumcompare.patch'
--- sys/i386/i386/in_cksum.c.back	2007-07-17 18:32:52.000000000 +0000
+++ sys/i386/i386/in_cksum.c	2007-07-22 00:36:56.000000000 +0000
@@ -43,7 +43,7 @@
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
-
+#include <netinet/ip_var.h>
 #include <machine/in_cksum.h>
 
 /*
@@ -259,7 +259,7 @@
  * load the value into a register but will not use it.  Since modern CPUs
  * reorder operations, this will generally take place in parallel with
  * other calculations.
- */
+ *
 #define ADD(n)        __asm __volatile \
                ("addl %1, %0" : "+r" (sum) : \
                "g" (((const u_int32_t *)w)[n / 4]))
@@ -270,7 +270,11 @@
                ("" : : "r" (((const u_int32_t *)w)[n / 4]))
 #define MOP           __asm __volatile \
                ("adcl $0, %0" : "+r" (sum))
-
+ * These macros are commented out because gcc may insert instructions
+ * between the separate __asm statements and clobber the carry flag.
+ */
+u_int64_t
+in_cksum_range(struct mbuf *m, int nxt, int offset, int bytes);
 u_short
 in_cksum_skip(m, len, skip)
        struct mbuf *m;
@@ -279,10 +283,14 @@
 {
        register u_short *w;
        register unsigned sum = 0;
+       u_int64_t rv, rv1, rv2;
        register int mlen = 0;
        int byte_swapped = 0;
        union { char c[2]; u_short s; } su;
 
+       rv2 = in_cksum_range(m, 0, skip, len - skip);
+
+       __asm __volatile("rdtsc" : "=A" (rv));
        len -= skip;
        for (; skip && m; m = m->m_next) {
                if (m->m_len > skip) {
@@ -341,15 +349,25 @@
                 * Advance to a 486 cache line boundary.
                 */
                if (4 & (int) w && mlen >= 4) {
-                       ADD(0);
-                       MOP;
+                       /* ADD(0);
+                       MOP; */
+                       __asm volatile ("addl %1, %0 \n"
+                               "adcl $0, %0"
+                               : "+r" (sum)
+                               : "g" (((const unsigned int *)w)[0 / 4]));
                        w += 2;
                        mlen -= 4;
                }
                if (8 & (int) w && mlen >= 8) {
-                       ADD(0);
+                       /* ADD(0);
                        ADDC(4);
-                       MOP;
+                       MOP; */
+                       __asm volatile ("addl %1, %0 \n"
+                               "adcl %2, %0 \n"
+                               "adcl $0, %0"
+                               : "+r" (sum) :
+                               "g" (((const unsigned int *)w)[0 / 4]),
+                               "g" (((const unsigned int *)w)[4 / 4]));
                        w += 4;
                        mlen -= 8;
                }
@@ -379,7 +397,7 @@
                         * is initially 33 (not 32) to guaranteed that
                         * the LOAD(32) is within bounds.
                         */
-                       ADD(16);
+                       /* ADD(16);
                        ADDC(0);
                        ADDC(4);
                        ADDC(8);
@@ -388,12 +406,30 @@
                        ADDC(20);
                        ADDC(24);
                        ADDC(28);
-                       MOP;
+                       MOP; */
+                       __asm volatile ("addl %1, %0 \n"
+                               "adcl %2, %0 \n"
+                               "adcl %3, %0 \n"
+                               "adcl %4, %0 \n"
+                               "adcl %5, %0 \n"
+                               "adcl %7, %0 \n"
+                               "adcl %8, %0 \n"
+                               "adcl %9, %0 \n"
+                               "adcl $0, %0" : "+r" (sum) :
+                               "g" (((const unsigned int *)w)[16 / 4]),
+                               "g" (((const unsigned int *)w)[0 / 4]),
+                               "g" (((const unsigned int *)w)[4 / 4]),
+                               "g" (((const unsigned int *)w)[8 / 4]),
+                               "g" (((const unsigned int *)w)[12 / 4]),
+                               "r" (((const unsigned int *)w)[32 / 4]),
+                               "g" (((const unsigned int *)w)[20 / 4]),
+                               "g" (((const unsigned int *)w)[24 / 4]),
+                               "g" (((const unsigned int *)w)[28 / 4]));
                        w += 16;
                }
                mlen += 32 + 1;
                if (mlen >= 32) {
-                       ADD(16);
+                       /* ADD(16);
                        ADDC(0);
                        ADDC(4);
                        ADDC(8);
@@ -401,23 +437,54 @@
                        ADDC(20);
                        ADDC(24);
                        ADDC(28);
-                       MOP;
+                       MOP; */
+                       __asm volatile ("addl %1, %0 \n"
+                               "adcl %2, %0 \n"
+                               "adcl %3, %0 \n"
+                               "adcl %4, %0 \n"
+                               "adcl %5, %0 \n"
+                               "adcl %6, %0 \n"
+                               "adcl %7, %0 \n"
+                               "adcl %8, %0 \n"
+                               "adcl $0, %0" : "+r" (sum) :
+                               "g" (((const unsigned int *)w)[16 / 4]),
+                               "g" (((const unsigned int *)w)[0 / 4]),
+                               "g" (((const unsigned int *)w)[4 / 4]),
+                               "g" (((const unsigned int *)w)[8 / 4]),
+                               "g" (((const unsigned int *)w)[12 / 4]),
+                               "g" (((const unsigned int *)w)[20 / 4]),
+                               "g" (((const unsigned int *)w)[24 / 4]),
+                               "g" (((const unsigned int *)w)[28 / 4]));
                        w += 16;
                        mlen -= 32;
                }
                if (mlen >= 16) {
-                       ADD(0);
+                       /* ADD(0);
                        ADDC(4);
                        ADDC(8);
                        ADDC(12);
-                       MOP;
+                       MOP; */
+                       __asm volatile ("addl %1, %0 \n"
+                               "adcl %2, %0 \n"
+                               "adcl %3, %0 \n"
+                               "adcl %4, %0 \n"
+                               "adcl $0, %0" : "+r" (sum) :
+                               "g" (((const unsigned int *)w)[0 / 4]),
+                               "g" (((const unsigned int *)w)[4 / 4]),
+                               "g" (((const unsigned int *)w)[8 / 4]),
+                               "g" (((const unsigned int *)w)[12 / 4]));
                        w += 8;
                        mlen -= 16;
                }
                if (mlen >= 8) {
-                       ADD(0);
+                       /* ADD(0);
                        ADDC(4);
-                       MOP;
+                       MOP; */
+                       __asm volatile ("addl %1, %0 \n"
+                               "adcl %2, %0 \n"
+                               "adcl $0, %0" : "+r" (sum) :
+                               "g" (((const unsigned int *)w)[0 / 4]),
+                               "g" (((const unsigned int *)w)[4 / 4]));
                        w += 4;
                        mlen -= 8;
                }
@@ -455,6 +522,172 @@
                sum += su.s;
        }
        REDUCE;
+       __asm __volatile("rdtsc" : "=A" (rv1));
+       printf("DFBSD took %16llx cycles while FreeBSD took %16llx cycles\n", rv2, rv1 - rv);
        return (~sum & 0xffff);
 }
+__uint32_t asm_ones32(const void *buf, int count);    /* in 32 bit words */
+u_int64_t
+in_cksum_range(struct mbuf *m, int nxt, int offset, int bytes)
+{
+       __uint8_t *ptr;
+       __uint32_t sum0;
+       __uint32_t sum1;
+       int n;
+       int flip;
+       u_int64_t rv, rv1;
+
+       __asm __volatile("rdtsc" : "=A" (rv));
+       sum0 = 0;
+       sum1 = 0;
+       flip = 0;
+
+       if (nxt != 0) {
+               uint32_t sum32;
+               struct ipovly ipov;
+
+               /* pseudo header */
+               if (offset < sizeof(struct ipovly))
+                       panic("in_cksum_range: offset too short");
+               if (m->m_len < sizeof(struct ip))
+                       panic("in_cksum_range: bad mbuf chain");
+               bzero(&ipov, sizeof ipov);
+               ipov.ih_len = htons(bytes);
+               ipov.ih_pr = nxt;
+               ipov.ih_src = mtod(m, struct ip *)->ip_src;
+               ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
+               ptr = (uint8_t *)&ipov;
+
+               sum32 = asm_ones32(ptr, sizeof(ipov) / 4);
+               sum32 = (sum32 >> 16) + (sum32 & 0xffff);
+               if (flip)
+                       sum1 += sum32;
+               else
+                       sum0 += sum32;
+       }
+
+       /*
+        * Skip fully engulfed mbufs.  Branch predict optimal.
+        */
+       while (m && offset >= m->m_len) {
+               offset -= m->m_len;
+               m = m->m_next;
+       }
+
+       /*
+        * Process the checksum for each segment.  Note that the code below is
+        * branch-predict optimal, so it's faster than you might otherwise
+        * believe.  When we are buffer-aligned but also odd-byte-aligned from
+        * the point of view of the IP packet, we accumulate to sum1 instead of
+        * sum0.
+        *
+        * Initial offsets do not pre-set flip (assert that offset is even?)
+        */
+       while (bytes > 0 && m) {
+               /*
+                * Calculate pointer base and number of bytes to snarf, account
+                * for snarfed bytes.
+                */
+               ptr = mtod(m, __uint8_t *) + offset;
+               if ((n = m->m_len - offset) > bytes)
+                       n = bytes;
+               bytes -= n;
+
+               /*
+                * First 16-bit-align our buffer by eating a byte if necessary,
+                * then 32-bit-align our buffer by eating a word if necessary.
+                *
+                * We are endian-sensitive when chomping a byte.  WARNING!  Be
+                * careful optimizing this!  16 and 32 bit words must be aligned
+                * for this to be generic code.
+                */
+               if (((intptr_t)ptr & 1) && n) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+                       if (flip)
+                               sum1 += ptr[0];
+                       else
+                               sum0 += ptr[0];
+#else
+                       if (flip)
+                               sum0 += ptr[0];
+                       else
+                               sum1 += ptr[0];
+#endif
+                       ++ptr;
+                       --n;
+                       flip = 1 - flip;
+               }
+               if (((intptr_t)ptr & 2) && n > 1) {
+                       if (flip)
+                               sum1 += *(__uint16_t *)ptr;
+                       else
+                               sum0 += *(__uint16_t *)ptr;
+                       ptr += 2;
+                       n -= 2;
+               }
+
+               /*
+                * Process a 32-bit aligned data buffer and accumulate the result
+                * in sum0 or sum1.  Allow only one 16 bit overflow carry.
+                */
+               if (n >= 4) {
+                       __uint32_t sum32;
+
+                       sum32 = asm_ones32((void *)ptr, n >> 2);
+                       sum32 = (sum32 >> 16) + (sum32 & 0xffff);
+                       if (flip)
+                               sum1 += sum32;
+                       else
+                               sum0 += sum32;
+                       ptr += n & ~3;
+                       /* n &= 3; dontcare */
+               }
+
+               /*
+                * Handle oddly-sized buffers.  Handle word issues first while
+                * ptr is still aligned.
+                */
+               if (n & 2) {
+                       if (flip)
+                               sum1 += *(__uint16_t *)ptr;
+                       else
+                               sum0 += *(__uint16_t *)ptr;
+                       ptr += 2;
+                       /* n -= 2; dontcare */
+               }
+               if (n & 1) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+                       if (flip)
+                               sum1 += ptr[0];
+                       else
+                               sum0 += ptr[0];
+#else
+                       if (flip)
+                               sum0 += ptr[0];
+                       else
+                               sum1 += ptr[0];
+#endif
+                       /* ++ptr; dontcare */
+                       /* --n; dontcare */
+                       flip = 1 - flip;
+               }
+               m = m->m_next;
+               offset = 0;
+       }
+
+       /*
+        * Due to byte aligned or oddly-sized buffers we may have a checksum
+        * in sum1 which needs to be shifted and added to our main sum.  There
+        * is a presumption here that no more than 255 overflows occurred, which
+        * is 255/3 byte-aligned mbufs in the worst case.
+        */
+       sum0 += sum1 << 8;
+       sum0 = (sum0 >> 16) + (sum0 & 0xffff);
+       if (sum0 > 0xffff)
+               ++sum0;
+       __asm __volatile("rdtsc" : "=A" (rv1));
+       return rv1 - rv;
+//     printf("DFBSD with %16llx cycles,", rv1 - rv);
+//     return (~sum0 & 0xffff);
+}
 #endif
--- sys/i386/i386/support.s.back	2007-07-22 00:46:55.000000000 +0000
+++ sys/i386/i386/support.s	2007-07-22 00:47:07.000000000 +0000
@@ -1532,3 +1532,50 @@
        movl    %edx,16(%eax)
        movl    %eax,bbhead
        NON_GPROF_RET
+
+       .text
+       /*
+        * asm_ones32(32bitalignedbuffer, numberof32bitwords)
+        *
+        * Returns the 32 bit one's complement partial checksum.  This is
+        * basically a 1's complement checksum without the inversion (~)
+        * at the end.  A 32 bit value is returned.  If the caller is
+        * calculating a 16 bit 1's complement checksum the caller must
+        * collapse the 32 bit return value via:
+        *
+        *      result = (result >> 16) + (result & 0xFFFF)
+        *      if (result > 0xFFFF)
+        *              result -= 0xFFFF;  <<< same as (result + 1) & 0xFFFF
+        *                  within the range of result.
+        * Note that worst case 0xFFFFFFFF + 0xFFFFFFFF = 0xFFFFFFFE + CARRY,
+        * so no double-carry ever occurs.
+        */
+       .p2align 4
+ENTRY(asm_ones32)
+       movl    4(%esp),%edx    /* %edx = buffer pointer */
+       movl    8(%esp),%ecx    /* %ecx = counter */
+       subl    %eax,%eax       /* %eax = checksum */
+       cmpl    $5,%ecx
+       jl      2f
+1:
+       subl    $5,%ecx
+       addl    (%edx),%eax
+       adcl    4(%edx),%eax
+       adcl    8(%edx),%eax
+       adcl    12(%edx),%eax
+       adcl    16(%edx),%eax
+       adcl    $0,%eax
+       addl    $20,%edx
+       cmpl    $5,%ecx
+       jge     1b
+2:
+       testl   %ecx,%ecx
+       je      4f
+3:
+       addl    (%edx),%eax
+       adcl    $0,%eax
+       addl    $4,%edx
+       decl    %ecx
+       jnz     3b
+4:
+       ret
 
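
For reference, the collapse described in the asm_ones32() comment can be written out in C. The sketch below is illustrative only and is not part of the patch; fold_to_cksum() and its arguments are hypothetical names, and only the folding arithmetic mirrors the comment above.

__uint32_t asm_ones32(const void *buf, int count);     /* count of 32 bit words */

/* Fold a 32 bit partial sum from asm_ones32() into a final 16 bit checksum. */
static __uint16_t
fold_to_cksum(const void *buf, int nwords)
{
        __uint32_t result;

        result = asm_ones32(buf, nwords);               /* 32 bit partial one's complement sum */
        result = (result >> 16) + (result & 0xFFFF);    /* fold the upper half back in */
        if (result > 0xFFFF)                            /* absorb the final carry; same as */
                result -= 0xFFFF;                       /* (result + 1) & 0xFFFF in this range */
        return (~result & 0xFFFF);                      /* invert for the on-wire checksum */
}

in_cksum_range() in the patch defers this fold until the very end: it accumulates 32 bit partial sums into sum0/sum1 across mbufs and only collapses to 16 bits once, which is what the "Allow only one 16 bit overflow carry" comments refer to.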