minor optimizations to precompute_partition_info_sums_()
[flac.git] / src / libFLAC / stream_encoder.c
index 7068103..5b2c12a 100644 (file)
 #include "private/stream_encoder_framing.h"
 #include "private/window.h"
 
+#ifndef FLaC__INLINE
+#define FLaC__INLINE
+#endif
+
 #ifdef min
 #undef min
 #endif
@@ -93,6 +97,7 @@
  */
 #undef ENABLE_RICE_PARAMETER_SEARCH 
 
+
 typedef struct {
        FLAC__int32 *data[FLAC__MAX_CHANNELS];
        unsigned size; /* of each data[] in samples */
@@ -123,13 +128,13 @@ static struct CompressionLevels {
        unsigned max_residual_partition_order;
        unsigned rice_parameter_search_dist;
 } compression_levels_[] = {
-       { false, false,  0, 0, false, false, false, 2, 2, 0 },
-       { true , true ,  0, 0, false, false, false, 2, 2, 0 },
+       { false, false,  0, 0, false, false, false, 0, 3, 0 },
+       { true , true ,  0, 0, false, false, false, 0, 3, 0 },
        { true , false,  0, 0, false, false, false, 0, 3, 0 },
-       { false, false,  6, 0, false, false, false, 3, 3, 0 },
-       { true , true ,  8, 0, false, false, false, 3, 3, 0 },
-       { true , false,  8, 0, false, false, false, 3, 3, 0 },
-       { true , false,  8, 0, false, false, false, 0, 4, 0 },
+       { false, false,  6, 0, false, false, false, 0, 4, 0 },
+       { true , true ,  8, 0, false, false, false, 0, 4, 0 },
+       { true , false,  8, 0, false, false, false, 0, 5, 0 },
+       { true , false,  8, 0, false, false, false, 0, 6, 0 },
        { true , false,  8, 0, false, false, true , 0, 6, 0 },
        { true , false, 12, 0, false, false, true , 0, 6, 0 }
 };
@@ -244,6 +249,7 @@ static unsigned find_best_partition_order_(
        unsigned rice_parameter,
        unsigned min_partition_order,
        unsigned max_partition_order,
+       unsigned bps,
        FLAC__bool do_escape_coding,
        unsigned rice_parameter_search_dist,
        FLAC__EntropyCodingMethod_PartitionedRice *best_partitioned_rice
@@ -255,7 +261,8 @@ static void precompute_partition_info_sums_(
        unsigned residual_samples,
        unsigned predictor_order,
        unsigned min_partition_order,
-       unsigned max_partition_order
+       unsigned max_partition_order,
+       unsigned bps
 );
 
 static void precompute_partition_info_escapes_(
@@ -356,7 +363,7 @@ typedef struct FLAC__StreamEncoderPrivate {
        FLAC__StreamMetadata_SeekTable *seek_table;       /* pointer into encoder->protected_->metadata_ where the seek table is */
        unsigned current_sample_number;
        unsigned current_frame_number;
-       struct FLAC__MD5Context md5context;
+       FLAC__MD5Context md5context;
        FLAC__CPUInfo cpuinfo;
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
        unsigned (*local_fixed_compute_best_predictor)(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
@@ -671,7 +678,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
                if(encoder->protected_->max_lpc_order == 0)
                        encoder->protected_->blocksize = 1152;
                else
-                       encoder->protected_->blocksize = 4608;
+                       encoder->protected_->blocksize = 4096;
        }
 
        if(encoder->protected_->blocksize < FLAC__MIN_BLOCK_SIZE || encoder->protected_->blocksize > FLAC__MAX_BLOCK_SIZE)
@@ -734,16 +741,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
                        encoder->protected_->blocksize != 16384
                )
                        return FLAC__STREAM_ENCODER_INIT_STATUS_NOT_STREAMABLE;
-               if(
-                       encoder->protected_->sample_rate != 8000 &&
-                       encoder->protected_->sample_rate != 16000 &&
-                       encoder->protected_->sample_rate != 22050 &&
-                       encoder->protected_->sample_rate != 24000 &&
-                       encoder->protected_->sample_rate != 32000 &&
-                       encoder->protected_->sample_rate != 44100 &&
-                       encoder->protected_->sample_rate != 48000 &&
-                       encoder->protected_->sample_rate != 96000
-               )
+               if(!FLAC__format_sample_rate_is_subset(encoder->protected_->sample_rate))
                        return FLAC__STREAM_ENCODER_INIT_STATUS_NOT_STREAMABLE;
                if(
                        encoder->protected_->bits_per_sample != 8 &&
@@ -922,7 +920,6 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
 #  ifdef FLAC__CPU_IA32
                FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
 #   ifdef FLAC__HAS_NASM
-#    ifdef FLAC__SSE_OS
                if(encoder->private_->cpuinfo.data.ia32.sse) {
                        if(encoder->protected_->max_lpc_order < 4)
                                encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4;
@@ -933,9 +930,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
                        else
                                encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32;
                }
-               else
-#    endif /* FLAC__SSE_OS */
-               if(encoder->private_->cpuinfo.data.ia32._3dnow)
+               else if(encoder->private_->cpuinfo.data.ia32._3dnow)
                        encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow;
                else
                        encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32;
@@ -2515,7 +2510,8 @@ FLAC__bool resize_buffers_(FLAC__StreamEncoder *encoder, unsigned new_blocksize)
                        ok = ok && FLAC__memory_alloc_aligned_int32_array(new_blocksize, &encoder->private_->residual_workspace_mid_side_unaligned[channel][i], &encoder->private_->residual_workspace_mid_side[channel][i]);
                }
        }
-       /* @@@@@@@@@ blocksize*2 is too pessimistic, but to fix, we need smarter logic because a smaller new_blocksize can actually increase the # of partitions */
+       /* the *2 is an approximation to the series 1 + 1/2 + 1/4 + ... that the sums tree occupies in a flat array */
+       /*@@@ new_blocksize*2 is too pessimistic, but to fix, we need smarter logic because a smaller new_blocksize can actually increase the # of partitions; would require moving this out into a separate function, then checking its capacity against the need of the current blocksize&min/max_partition_order (and maybe predictor order) */
        ok = ok && FLAC__memory_alloc_aligned_uint64_array(new_blocksize * 2, &encoder->private_->abs_residual_partition_sums_unaligned, &encoder->private_->abs_residual_partition_sums);
        if(encoder->protected_->do_escape_coding)
                ok = ok && FLAC__memory_alloc_aligned_unsigned_array(new_blocksize * 2, &encoder->private_->raw_bits_per_partition_unaligned, &encoder->private_->raw_bits_per_partition);
@@ -3653,7 +3649,7 @@ FLAC__bool add_subframe_(
        return true;
 }
 
-#define SPOTCHECK_ESTIMATE 0 //@@@@@@@@@
+#define SPOTCHECK_ESTIMATE 0
 #if SPOTCHECK_ESTIMATE
 static void spotcheck_subframe_estimate_(
        FLAC__StreamEncoder *encoder,
@@ -3747,6 +3743,7 @@ unsigned evaluate_fixed_subframe_(
                        rice_parameter,
                        min_partition_order,
                        max_partition_order,
+                       subframe_bps,
                        do_escape_coding,
                        rice_parameter_search_dist,
                        &subframe->data.fixed.entropy_coding_method.data.partitioned_rice
@@ -3827,6 +3824,7 @@ unsigned evaluate_lpc_subframe_(
                        rice_parameter,
                        min_partition_order,
                        max_partition_order,
+                       subframe_bps,
                        do_escape_coding,
                        rice_parameter_search_dist,
                        &subframe->data.lpc.entropy_coding_method.data.partitioned_rice
@@ -3884,6 +3882,7 @@ unsigned find_best_partition_order_(
        unsigned rice_parameter,
        unsigned min_partition_order,
        unsigned max_partition_order,
+       unsigned bps,
        FLAC__bool do_escape_coding,
        unsigned rice_parameter_search_dist,
        FLAC__EntropyCodingMethod_PartitionedRice *best_partitioned_rice
@@ -3896,7 +3895,7 @@ unsigned find_best_partition_order_(
        max_partition_order = FLAC__format_get_max_rice_partition_order_from_blocksize_limited_max_and_predictor_order(max_partition_order, blocksize, predictor_order);
        min_partition_order = min(min_partition_order, max_partition_order);
 
-       precompute_partition_info_sums_(residual, abs_residual_partition_sums, residual_samples, predictor_order, min_partition_order, max_partition_order);
+       precompute_partition_info_sums_(residual, abs_residual_partition_sums, residual_samples, predictor_order, min_partition_order, max_partition_order, bps);
 
        if(do_escape_coding)
                precompute_partition_info_escapes_(residual, raw_bits_per_partition, residual_samples, predictor_order, min_partition_order, max_partition_order);
@@ -3956,56 +3955,54 @@ void precompute_partition_info_sums_(
        unsigned residual_samples,
        unsigned predictor_order,
        unsigned min_partition_order,
-       unsigned max_partition_order
+       unsigned max_partition_order,
+       unsigned bps
 )
 {
        int partition_order;
        unsigned from_partition, to_partition = 0;
        const unsigned blocksize = residual_samples + predictor_order;
+       const unsigned partitions = 1u << max_partition_order;
+       const unsigned default_partition_samples = blocksize >> max_partition_order;
+       unsigned partition, end, residual_sample;
+
+       FLAC__ASSERT(default_partition_samples > predictor_order);
 
        /* first do max_partition_order */
-       for(partition_order = (int)max_partition_order; partition_order >= 0; partition_order--) {
-               FLAC__uint64 abs_residual_partition_sum; /* OPT: can reasonably be FLAC__uint32 for bps <= 17 and maybe higher */
-               unsigned partition, partition_sample, partition_samples, residual_sample;
-               const unsigned partitions = 1u << partition_order;
-               const unsigned default_partition_samples = blocksize >> partition_order;
+       if(FLAC__bitmath_ilog2(default_partition_samples) + bps < 32) { /* very slightly pessimistic but still catches all common cases */
+               FLAC__uint32 abs_residual_partition_sum;
 
-               FLAC__ASSERT(default_partition_samples > predictor_order);
+               end = (unsigned)(-(int)predictor_order);
+               for(partition = residual_sample = 0; partition < partitions; partition++) {
+                       end += default_partition_samples;
+                       abs_residual_partition_sum = 0;
+                       for( ; residual_sample < end; residual_sample++)
+                               abs_residual_partition_sum += abs(residual[residual_sample]); /* abs(INT_MIN) is undefined, but if the residual is INT_MIN we have bigger problems */
+                       abs_residual_partition_sums[partition] = abs_residual_partition_sum;
+               }
+       }
+       else { /* have to pessimistically use 64 bits for accumulator */
+               FLAC__uint64 abs_residual_partition_sum;
 
+               end = (unsigned)(-(int)predictor_order);
                for(partition = residual_sample = 0; partition < partitions; partition++) {
-                       partition_samples = default_partition_samples;
-                       if(partition == 0)
-                               partition_samples -= predictor_order;
+                       end += default_partition_samples;
                        abs_residual_partition_sum = 0;
-                       for(partition_sample = 0; partition_sample < partition_samples; partition_sample++, residual_sample++) {
-#if defined _MSC_VER && _MSC_VER <= 1200
-                               /* OPT: abs() may be faster for some compilers */
+                       for( ; residual_sample < end; residual_sample++)
                                abs_residual_partition_sum += abs(residual[residual_sample]); /* abs(INT_MIN) is undefined, but if the residual is INT_MIN we have bigger problems */
-#else
-                               const FLAC__int32 r = residual[residual_sample];
-                               if(r < 0)
-                                       abs_residual_partition_sum -= r;
-                               else
-                                       abs_residual_partition_sum += r;
-#endif
-                       }
                        abs_residual_partition_sums[partition] = abs_residual_partition_sum;
                }
-               to_partition = partitions;
-               break;
        }
 
        /* now merge partitions for lower orders */
-       for(from_partition = 0, --partition_order; partition_order >= (int)min_partition_order; partition_order--) {
-               FLAC__uint64 s;
+       for(from_partition = 0, to_partition = partitions, partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) {
                unsigned i;
                const unsigned partitions = 1u << partition_order;
                for(i = 0; i < partitions; i++) {
-                       s = abs_residual_partition_sums[from_partition];
-                       from_partition++;
-                       abs_residual_partition_sums[to_partition] = s + abs_residual_partition_sums[from_partition];
-                       from_partition++;
-                       to_partition++;
+                       abs_residual_partition_sums[to_partition++] =
+                               abs_residual_partition_sums[from_partition  ] +
+                               abs_residual_partition_sums[from_partition+1];
+                       from_partition += 2;
                }
        }
 }
@@ -4040,6 +4037,7 @@ void precompute_partition_info_escapes_(
                        rmax = 0;
                        for(partition_sample = 0; partition_sample < partition_samples; partition_sample++) {
                                r = residual[residual_sample++];
+                               /* OPT: maybe faster: rmax |= r ^ (r>>31) */
                                if(r < 0)
                                        rmax |= ~r;
                                else
@@ -4067,9 +4065,8 @@ void precompute_partition_info_escapes_(
        }
 }
 
-/*@@@@@@ overflow is a possible problem here for hi-res samples */
 #ifdef EXACT_RICE_BITS_CALCULATION
-static __inline unsigned count_rice_bits_in_partition_(
+static FLaC__INLINE unsigned count_rice_bits_in_partition_(
        const unsigned rice_parameter,
        const unsigned partition_samples,
        const FLAC__int32 *residual
@@ -4084,7 +4081,7 @@ static __inline unsigned count_rice_bits_in_partition_(
        return partition_bits;
 }
 #else
-static __inline unsigned count_rice_bits_in_partition_(
+static FLaC__INLINE unsigned count_rice_bits_in_partition_(
        const unsigned rice_parameter,
        const unsigned partition_samples,
        const FLAC__uint64 abs_residual_partition_sum
@@ -4100,7 +4097,7 @@ static __inline unsigned count_rice_bits_in_partition_(
                )
                - (partition_samples >> 1)
                /* -(partition_samples>>1) to subtract out extra contributions to the abs_residual_partition_sum.
-                * The actual number of bits used is closer to the sum for all i in the partition of  abs(residual[i])>>(rice_parameter-1)
+                * The actual number of bits used is closer to the sum(for all i in the partition) of  abs(residual[i])>>(rice_parameter-1)
                 * By using the abs_residual_partition sum, we also add in bits in the LSBs that would normally be shifted out.
                 * So the subtraction term tries to guess how many extra bits were contributed.
                 * If the LSBs are randomly distributed, this should average to 0.5 extra bits per sample.