1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
| size_t samples = file.file_size / 4; size_t ir_samples = sizeof(ir) / 4; for (size_t i = 0; i < samples; ++i) { float32x4_t out = { 0 }; for (size_t j = 0; j < ir_samples && i >= j; j += 4) { float32x4_t ir_x4 = vld1q_f32(ir + j); float32x4_t in; switch (i - j) { case 0: { float temp[4] = {file.in_file_data[0], 0, 0, 0}; in = vld1q_f32(temp); break; } case 1: { float temp[4] = {file.in_file_data[1], file.in_file_data[0], 0, 0}; in = vld1q_f32(temp); break; } case 2: { float temp[4] = {file.in_file_data[2], file.in_file_data[1], file.in_file_data[0], 0}; in = vld1q_f32(temp); break; } default: // float temp[4] = {file.in_file_data[i-j], file.in_file_data[i-j-1], file.in_file_data[i-j-2], file.in_file_data[i-j-3]}; // in = vld1q_f32(temp); in = vld1q_f32(file.in_file_data + i - j - 3); in = vrev64q_f32(in); break; } out = vmlaq_f32(out, in, ir_x4); } file.out_file_data[i] += vgetq_lane_f32(out, 0); file.out_file_data[i] += vgetq_lane_f32(out, 1); file.out_file_data[i] += vgetq_lane_f32(out, 2); file.out_file_data[i] += vgetq_lane_f32(out, 3); }
|