Optimize FLAC__bitreader_read_rice_signed_block.
authorMiroslav Lichvar <mlichvar@redhat.com>
Tue, 28 Aug 2012 09:58:41 +0000 (11:58 +0200)
committerErik de Castro Lopo <erikd@mega-nerd.com>
Tue, 28 Aug 2012 11:17:18 +0000 (21:17 +1000)
Signed-off-by: Erik de Castro Lopo <erikd@mega-nerd.com>
src/libFLAC/bitreader.c

index 2660c42..792d6dd 100644 (file)
@@ -711,379 +711,144 @@ FLAC__bool FLAC__bitreader_read_rice_signed(FLAC__BitReader *br, int *val, unsig
 }
 
 /* this is by far the most heavily used reader call.  it ain't pretty but it's fast */
-/* a lot of the logic is copied, then adapted, from FLAC__bitreader_read_unary_unsigned() and FLAC__bitreader_read_raw_uint32() */
 FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
-/* OPT: possibly faster version for use with MSVC */
-#ifdef _MSC_VER
 {
-       unsigned i;
-       unsigned uval = 0;
-       unsigned bits; /* the # of binary LSBs left to read to finish a rice codeword */
-
        /* try and get br->consumed_words and br->consumed_bits into register;
         * must remember to flush them back to *br before calling other
-        * bitwriter functions that use them, and before returning */
-       register unsigned cwords;
-       register unsigned cbits;
+        * bitreader functions that use them, and before returning */
+       unsigned cwords, words, lsbs, msbs, x, y;
+       unsigned ucbits; /* keep track of the number of unconsumed bits in word */
+       uint32_t b;
+       int *val, *end;
 
        FLAC__ASSERT(0 != br);
        FLAC__ASSERT(0 != br->buffer);
        /* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
        FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
        FLAC__ASSERT(parameter < 32);
-       /* the above two asserts also guarantee that the binary part never straddles more that 2 words, so we don't have to loop to read it */
-
-       if(nvals == 0)
-               return true;
-
-       cbits = br->consumed_bits;
-       cwords = br->consumed_words;
+       /* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */
 
-       while(1) {
+       val = vals;
+       end = vals + nvals;
 
-               /* read unary part */
-               while(1) {
-                       while(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
-                               uint32_t b = br->buffer[cwords] << cbits;
-                               if(b) {
-#if 0 /* slower, probably due to bad register allocation... */ && defined FLAC__CPU_IA32 && !defined FLAC__NO_ASM && FLAC__BITS_PER_WORD == 32
-                                       __asm {
-                                               bsr eax, b
-                                               not eax
-                                               and eax, 31
-                                               mov i, eax
-                                       }
-#else
-                                       i = FLAC__clz_uint32(b);
-#endif
-                                       uval += i;
-                                       bits = parameter;
-                                       i++;
-                                       cbits += i;
-                                       if(cbits == FLAC__BITS_PER_WORD) {
-                                               crc16_update_word_(br, br->buffer[cwords]);
-                                               cwords++;
-                                               cbits = 0;
-                                       }
-                                       goto break1;
-                               }
-                               else {
-                                       uval += FLAC__BITS_PER_WORD - cbits;
-                                       crc16_update_word_(br, br->buffer[cwords]);
-                                       cwords++;
-                                       cbits = 0;
-                                       /* didn't find stop bit yet, have to keep going... */
-                               }
-                       }
-                       /* at this point we've eaten up all the whole words; have to try
-                        * reading through any tail bytes before calling the read callback.
-                        * this is a repeat of the above logic adjusted for the fact we
-                        * don't have a whole word.  note though if the client is feeding
-                        * us data a byte at a time (unlikely), br->consumed_bits may not
-                        * be zero.
-                        */
-                       if(br->bytes*8 > cbits) {
-                               const unsigned end = br->bytes * 8;
-                               uint32_t b = (br->buffer[cwords] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD-end))) << cbits;
-                               if(b) {
-                                       i = FLAC__clz_uint32(b);
-                                       uval += i;
-                                       bits = parameter;
-                                       i++;
-                                       cbits += i;
-                                       FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
-                                       goto break1;
-                               }
-                               else {
-                                       uval += end - cbits;
-                                       cbits = end;
-                                       FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
-                                       /* didn't find stop bit yet, have to keep going... */
-                               }
-                       }
-                       /* flush registers and read; bitreader_read_from_client_() does
-                        * not touch br->consumed_bits at all but we still need to set
-                        * it in case it fails and we have to return false.
-                        */
-                       br->consumed_bits = cbits;
-                       br->consumed_words = cwords;
-                       if(!bitreader_read_from_client_(br))
+       if(parameter == 0) {
+               while(val < end) {
+                       /* read the unary MSBs and end bit */
+                       if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
                                return false;
-                       cwords = br->consumed_words;
-               }
-break1:
-               /* read binary part */
-               FLAC__ASSERT(cwords <= br->words);
-
-               if(bits) {
-                       while((br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits < bits) {
-                               /* flush registers and read; bitreader_read_from_client_() does
-                                * not touch br->consumed_bits at all but we still need to set
-                                * it in case it fails and we have to return false.
-                                */
-                               br->consumed_bits = cbits;
-                               br->consumed_words = cwords;
-                               if(!bitreader_read_from_client_(br))
-                                       return false;
-                               cwords = br->consumed_words;
-                       }
-                       if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
-                               if(cbits) {
-                                       /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
-                                       const unsigned n = FLAC__BITS_PER_WORD - cbits;
-                                       const uint32_t word = br->buffer[cwords];
-                                       if(bits < n) {
-                                               uval <<= bits;
-                                               uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-bits);
-                                               cbits += bits;
-                                               goto break2;
-                                       }
-                                       uval <<= n;
-                                       uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
-                                       bits -= n;
-                                       crc16_update_word_(br, word);
-                                       cwords++;
-                                       cbits = 0;
-                                       if(bits) { /* if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
-                                               uval <<= bits;
-                                               uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits));
-                                               cbits = bits;
-                                       }
-                                       goto break2;
-                               }
-                               else {
-                                       FLAC__ASSERT(bits < FLAC__BITS_PER_WORD);
-                                       uval <<= bits;
-                                       uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits);
-                                       cbits = bits;
-                                       goto break2;
-                               }
-                       }
-                       else {
-                               /* in this case we're starting our read at a partial tail word;
-                                * the reader has guaranteed that we have at least 'bits' bits
-                                * available to read, which makes this case simpler.
-                                */
-                               uval <<= bits;
-                               if(cbits) {
-                                       /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
-                                       FLAC__ASSERT(cbits + bits <= br->bytes*8);
-                                       uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-bits);
-                                       cbits += bits;
-                                       goto break2;
-                               }
-                               else {
-                                       uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits);
-                                       cbits += bits;
-                                       goto break2;
-                               }
-                       }
-               }
-break2:
-               /* compose the value */
-               *vals = (int)(uval >> 1 ^ -(int)(uval & 1));
 
-               /* are we done? */
-               --nvals;
-               if(nvals == 0) {
-                       br->consumed_bits = cbits;
-                       br->consumed_words = cwords;
-                       return true;
+                       *val++ = (int)(msbs >> 1) ^ -(int)(msbs & 1);
                }
 
-               uval = 0;
-               ++vals;
-
+               return true;
        }
-}
-#else
-{
-       unsigned i;
-       unsigned uval = 0;
 
-       /* try and get br->consumed_words and br->consumed_bits into register;
-        * must remember to flush them back to *br before calling other
-        * bitwriter functions that use them, and before returning */
-       register unsigned cwords;
-       register unsigned cbits;
-       unsigned ucbits; /* keep track of the number of unconsumed bits in the buffer */
+       FLAC__ASSERT(parameter > 0);
 
-       FLAC__ASSERT(0 != br);
-       FLAC__ASSERT(0 != br->buffer);
-       /* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
-       FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
-       FLAC__ASSERT(parameter < 32);
-       /* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */
+       cwords = br->consumed_words;
+       words = br->words;
 
-       if(nvals == 0)
-               return true;
+       /* if we've not consumed up to a partial tail word... */
+       if(cwords >= words) {
+               x = 0;
+               goto process_tail;
+       }
 
-       cbits = br->consumed_bits;
-       cwords = br->consumed_words;
-       ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
+       ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
+       b = br->buffer[cwords] << br->consumed_bits;  /* keep unconsumed bits aligned to left */
 
-       while(1) {
+       while(val < end) {
+               /* read the unary MSBs and end bit */
+               x = y = FLAC__clz2_uint32(b);
+               if(x == FLAC__BITS_PER_WORD) {
+                       x = ucbits;
+                       do {
+                               /* didn't find stop bit yet, have to keep going... */
+                               crc16_update_word_(br, br->buffer[cwords++]);
+                               if (cwords >= words)
+                                       goto incomplete_msbs;
+                               b = br->buffer[cwords];
+                               y = FLAC__clz2_uint32(b);
+                               x += y;
+                       } while(y == FLAC__BITS_PER_WORD);
+               }
+               b <<= y;
+               b <<= 1; /* account for stop bit */
+               ucbits = (ucbits - x - 1) % FLAC__BITS_PER_WORD;
+               msbs = x;
+
+               /* read the binary LSBs */
+               x = b >> (FLAC__BITS_PER_WORD - parameter);
+               if(parameter <= ucbits) {
+                       ucbits -= parameter;
+                       b <<= parameter;
+               } else {
+                       /* there are still bits left to read, they will all be in the next word */
+                       crc16_update_word_(br, br->buffer[cwords++]);
+                       if (cwords >= words)
+                               goto incomplete_lsbs;
+                       b = br->buffer[cwords];
+                       ucbits += FLAC__BITS_PER_WORD - parameter;
+                       x |= b >> ucbits;
+                       b <<= FLAC__BITS_PER_WORD - ucbits;
+               }
+               lsbs = x;
 
-               /* read unary part */
-               while(1) {
-                       while(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
-                               uint32_t b = br->buffer[cwords] << cbits;
-                               if(b) {
-#if 0 /* is not discernably faster... */ && defined FLAC__CPU_IA32 && !defined FLAC__NO_ASM && FLAC__BITS_PER_WORD == 32 && defined __GNUC__
-                                       asm volatile (
-                                               "bsrl %1, %0;"
-                                               "notl %0;"
-                                               "andl $31, %0;"
-                                               : "=r"(i)
-                                               : "r"(b)
-                                       );
-#else
-                                       i = FLAC__clz_uint32(b);
-#endif
-                                       uval += i;
-                                       cbits += i;
-                                       cbits++; /* skip over stop bit */
-                                       if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits == FLAC__BITS_PER_WORD) */
-                                               crc16_update_word_(br, br->buffer[cwords]);
-                                               cwords++;
-                                               cbits = 0;
-                                       }
-                                       goto break1;
-                               }
-                               else {
-                                       uval += FLAC__BITS_PER_WORD - cbits;
-                                       crc16_update_word_(br, br->buffer[cwords]);
-                                       cwords++;
-                                       cbits = 0;
-                                       /* didn't find stop bit yet, have to keep going... */
-                               }
-                       }
-                       /* at this point we've eaten up all the whole words; have to try
-                        * reading through any tail bytes before calling the read callback.
-                        * this is a repeat of the above logic adjusted for the fact we
-                        * don't have a whole word.  note though if the client is feeding
-                        * us data a byte at a time (unlikely), br->consumed_bits may not
-                        * be zero.
-                        */
-                       if(br->bytes*8 > cbits) {
-                               const unsigned end = br->bytes * 8;
-                               uint32_t b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits;
-                               if(b) {
-                                       i = FLAC__clz_uint32(b);
-                                       uval += i;
-                                       cbits += i;
-                                       cbits++; /* skip over stop bit */
-                                       FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
-                                       goto break1;
-                               }
-                               else {
-                                       uval += end - cbits;
-                                       cbits = end;
-                                       FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
-                                       /* didn't find stop bit yet, have to keep going... */
-                               }
+               /* compose the value */
+               x = (msbs << parameter) | lsbs;
+               *val++ = (int)(x >> 1) ^ -(int)(x & 1);
+
+               continue;
+
+               /* at this point we've eaten up all the whole words */
+process_tail:
+               do {
+                       if(0) {
+incomplete_msbs:
+                               br->consumed_bits = 0;
+                               br->consumed_words = cwords;
                        }
-                       /* flush registers and read; bitreader_read_from_client_() does
-                        * not touch br->consumed_bits at all but we still need to set
-                        * it in case it fails and we have to return false.
-                        */
-                       br->consumed_bits = cbits;
-                       br->consumed_words = cwords;
-                       if(!bitreader_read_from_client_(br))
+
+                       /* read the unary MSBs and end bit */
+                       if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
                                return false;
-                       cwords = br->consumed_words;
-                       ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval;
-                       /* + uval to offset our count by the # of unary bits already
-                        * consumed before the read, because we will add these back
-                        * in all at once at break1
-                        */
-               }
-break1:
-               ucbits -= uval;
-               ucbits--; /* account for stop bit */
-
-               /* read binary part */
-               FLAC__ASSERT(cwords <= br->words);
-
-               if(parameter) {
-                       while(ucbits < parameter) {
-                               /* flush registers and read; bitreader_read_from_client_() does
-                                * not touch br->consumed_bits at all but we still need to set
-                                * it in case it fails and we have to return false.
-                                */
-                               br->consumed_bits = cbits;
+                       msbs += x;
+                       x = ucbits = 0;
+
+                       if(0) {
+incomplete_lsbs:
+                               br->consumed_bits = 0;
                                br->consumed_words = cwords;
-                               if(!bitreader_read_from_client_(br))
-                                       return false;
-                               cwords = br->consumed_words;
-                               ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
-                       }
-                       if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
-                               if(cbits) {
-                                       /* this also works when consumed_bits==0, it's just slower than necessary for that case */
-                                       const unsigned n = FLAC__BITS_PER_WORD - cbits;
-                                       const uint32_t word = br->buffer[cwords];
-                                       if(parameter < n) {
-                                               uval <<= parameter;
-                                               uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter);
-                                               cbits += parameter;
-                                       }
-                                       else {
-                                               uval <<= n;
-                                               uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
-                                               crc16_update_word_(br, word);
-                                               cwords++;
-                                               cbits = parameter - n;
-                                               if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
-                                                       uval <<= cbits;
-                                                       uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
-                                               }
-                                       }
-                               }
-                               else {
-                                       cbits = parameter;
-                                       uval <<= parameter;
-                                       uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
-                               }
                        }
-                       else {
-                               /* in this case we're starting our read at a partial tail word;
-                                * the reader has guaranteed that we have at least 'parameter'
-                                * bits available to read, which makes this case simpler.
-                                */
-                               uval <<= parameter;
-                               if(cbits) {
-                                       /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
-                                       FLAC__ASSERT(cbits + parameter <= br->bytes*8);
-                                       uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter);
-                                       cbits += parameter;
-                               }
-                               else {
-                                       cbits = parameter;
-                                       uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
-                               }
-                       }
-               }
 
-               ucbits -= parameter;
-
-               /* compose the value */
-               *vals = (int)(uval >> 1 ^ -(int)(uval & 1));
+                       /* read the binary LSBs */
+                       if(!FLAC__bitreader_read_raw_uint32(br, &lsbs, parameter - ucbits))
+                               return false;
+                       lsbs = x | lsbs;
 
-               /* are we done? */
-               --nvals;
-               if(nvals == 0) {
-                       br->consumed_bits = cbits;
-                       br->consumed_words = cwords;
-                       return true;
-               }
+                       /* compose the value */
+                       x = (msbs << parameter) | lsbs;
+                       *val++ = (int)(x >> 1) ^ -(int)(x & 1);
+                       x = 0;
 
-               uval = 0;
-               ++vals;
+                       cwords = br->consumed_words;
+                       words = br->words;
+                       ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
+                       b = br->buffer[cwords] << br->consumed_bits;
+               } while(cwords >= words && val < end);
+       }
 
+       if(ucbits == 0 && cwords < words) {
+               /* don't leave the head word with no unconsumed bits */
+               crc16_update_word_(br, br->buffer[cwords++]);
+               ucbits = FLAC__BITS_PER_WORD;
        }
+
+       br->consumed_bits = FLAC__BITS_PER_WORD - ucbits;
+       br->consumed_words = cwords;
+
+       return true;
 }
-#endif
 
 #if 0 /* UNUSED */
 FLAC__bool FLAC__bitreader_read_golomb_signed(FLAC__BitReader *br, int *val, unsigned parameter)