../src/lowrisc_ibex_ibex_core_0.1/rtl/ibex_alu.sv Cov: 42.1%

   1: // Copyright lowRISC contributors.
   2: // Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
   3: // Licensed under the Apache License, Version 2.0, see LICENSE for details.
   4: // SPDX-License-Identifier: Apache-2.0
   5: 
   6: /**
   7:  * Arithmetic logic unit
   8:  */
   9: module ibex_alu #(
  10:   parameter bit RV32B = 1'b0
  11: ) (
  12:     input  ibex_pkg::alu_op_e operator_i,
  13:     input  logic [31:0]       operand_a_i,
  14:     input  logic [31:0]       operand_b_i,
  15: 
  16:     input  logic              instr_first_cycle_i,
  17: 
  18:     input  logic [32:0]       multdiv_operand_a_i,
  19:     input  logic [32:0]       multdiv_operand_b_i,
  20: 
  21:     input  logic              multdiv_sel_i,
  22: 
  23:     input  logic [31:0]       imd_val_q_i,
  24:     output logic [31:0]       imd_val_d_o,
  25:     output logic              imd_val_we_o,
  26: 
  27:     output logic [31:0]       adder_result_o,
  28:     output logic [33:0]       adder_result_ext_o,
  29: 
  30:     output logic [31:0]       result_o,
  31:     output logic              comparison_result_o,
  32:     output logic              is_equal_result_o
  33: );
  34:   import ibex_pkg::*;
  35: 
  36:   logic [31:0] operand_a_rev;
  37:   logic [32:0] operand_b_neg;
  38: 
  39:   // bit reverse operand_a for left shifts and bit counting
  40:   for (genvar k = 0; k < 32; k++) begin : gen_rev_operand_a
  41:     assign operand_a_rev[k] = operand_a_i[31-k];
  42:   end
  43: 
  44:   ///////////
  45:   // Adder //
  46:   ///////////
  47: 
  48:   logic        adder_op_b_negate;
  49:   logic [32:0] adder_in_a, adder_in_b;
  50:   logic [31:0] adder_result;
  51: 
  52:   always_comb begin
  53:     adder_op_b_negate = 1'b0;
  54:     unique case (operator_i)
  55:       // Adder OPs
  56:       ALU_SUB,
  57: 
  58:       // Comparator OPs
  59:       ALU_EQ,   ALU_NE,
  60:       ALU_GE,   ALU_GEU,
  61:       ALU_LT,   ALU_LTU,
  62:       ALU_SLT,  ALU_SLTU,
  63: 
  64:       // MinMax OPs (RV32B Ops)
  65:       ALU_MIN,  ALU_MINU,
  66:       ALU_MAX,  ALU_MAXU: adder_op_b_negate = 1'b1;
  67: 
  68:       default:;
  69:     endcase
  70:   end
  71: 
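        // Editorial note, for illustration: the operand preparation below widens both operands by
        // one bit at the LSB so that subtraction can reuse the single 33-bit adder. For a - b,
        // operand a gets a trailing 1 and operand b is inverted with a trailing 1 appended
        // ({b, 1'b0} ^ '1), so the two trailing ones supply the "+1" of the two's complement:
        //   {a, 1'b1} + {~b, 1'b1} = {a + ~b + 1, 1'b0}
        // The 32-bit result is then taken from bits [32:1] of the extended sum below.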
  72:   // prepare operand a
  73:   assign adder_in_a    = multdiv_sel_i ? multdiv_operand_a_i : {operand_a_i,1'b1};
  74: 
  75:   // prepare operand b
  76:   assign operand_b_neg = {operand_b_i,1'b0} ^ {33{1'b1}};
  77:   always_comb begin
  78:     unique case(1'b1)
  79:       multdiv_sel_i:     adder_in_b = multdiv_operand_b_i;
  80:       adder_op_b_negate: adder_in_b = operand_b_neg;
  81:       default :          adder_in_b = {operand_b_i, 1'b0};
  82:     endcase
  83:   end
  84: 
  85:   // actual adder
  86:   assign adder_result_ext_o = $unsigned(adder_in_a) + $unsigned(adder_in_b);
  87: 
  88:   assign adder_result       = adder_result_ext_o[32:1];
  89: 
  90:   assign adder_result_o     = adder_result;
  91: 
  92:   ////////////////
  93:   // Comparison //
  94:   ////////////////
  95: 
  96:   logic is_equal;
  97:   logic is_greater_equal;  // handles both signed and unsigned forms
  98:   logic cmp_signed;
  99: 
 100:   always_comb begin
 101:     unique case (operator_i)
 102:       ALU_GE,
 103:       ALU_LT,
 104:       ALU_SLT,
 105:       // RV32B only
 106:       ALU_MIN,
 107:       ALU_MAX: cmp_signed = 1'b1;
 108: 
 109:       default: cmp_signed = 1'b0;
 110:     endcase
 111:   end
 112: 
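        // Editorial note: all comparison operators set adder_op_b_negate above, so the adder
        // computes operand_a_i - operand_b_i here and equality reduces to a zero check.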
 113:   assign is_equal = (adder_result == 32'b0);
 114:   assign is_equal_result_o = is_equal;
 115: 
 116:   // Is greater equal
 117:   always_comb begin
 118:     if ((operand_a_i[31] ^ operand_b_i[31]) == 1'b0) begin
 119:       is_greater_equal = (adder_result[31] == 1'b0);
 120:     end else begin
 121:       is_greater_equal = operand_a_i[31] ^ (cmp_signed);
 122:     end
 123:   end
 124: 
 125:   // GTE unsigned:
 126:   // (a[31] == 1 && b[31] == 1) => adder_result[31] == 0
 127:   // (a[31] == 0 && b[31] == 0) => adder_result[31] == 0
 128:   // (a[31] == 1 && b[31] == 0) => 1
 129:   // (a[31] == 0 && b[31] == 1) => 0
 130: 
 131:   // GTE signed:
 132:   // (a[31] == 1 && b[31] == 1) => adder_result[31] == 0
 133:   // (a[31] == 0 && b[31] == 0) => adder_result[31] == 0
 134:   // (a[31] == 1 && b[31] == 0) => 0
 135:   // (a[31] == 0 && b[31] == 1) => 1
 136: 
 137:   // generate comparison result
 138:   logic cmp_result;
 139: 
 140:   always_comb begin
 141:     unique case (operator_i)
 142:       ALU_EQ:             cmp_result =  is_equal;
 143:       ALU_NE:             cmp_result = ~is_equal;
 144:       ALU_GE,   ALU_GEU,
 145:       ALU_MAX,  ALU_MAXU: cmp_result = is_greater_equal; // RV32B only
 146:       ALU_LT,   ALU_LTU,
 147:       ALU_MIN,  ALU_MINU, //RV32B only
 148:       ALU_SLT,  ALU_SLTU: cmp_result = ~is_greater_equal;
 149: 
 150:       default: cmp_result = is_equal;
 151:     endcase
 152:   end
 153: 
 154:   assign comparison_result_o = cmp_result;
 155: 
 156:   ///////////
 157:   // Shift //
 158:   ///////////
 159: 
 160:   // The shifter structure consists of a 33-bit shifter: 32-bit operand + 1 bit extension for
 161:   // arithmetic shifts and ones-shift support.
 162:   // Rotations and funnel shifts are implemented as multi-cycle instructions.
 163:   // The shifter is also used for single-bit instructions and bit-field place as detailed below.
 164:   //
 165:   // Standard Shifts
 166:   // ===============
 167:   // For standard shift instructions, the direction of the shift is to the right by default. For
 168:   // left shifts, the shift_left signal is set. If so, the operand is initially bit-reversed,
 169:   // shifted to the right by the specified amount and then bit-reversed again. For arithmetic and
 170:   // ones-shifts the 33rd bit of the shifter operand is set accordingly.
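        //
        // For illustration, the left-shift path can be summarised as:
        //   sll(a, amt) = rev(rev(a) >> amt)
        // where rev() denotes a 32-bit bit reversal.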
 171:   //
 172:   // Multicycle Shifts
 173:   // =================
 174:   //
 175:   // Rotation
 176:   // --------
 177:   // For rotations, the operand signals operand_a_i and operand_b_i are kept constant to rs1 and
 178:   // rs2 respectively.
 179:   //
 180:   // Rotation pseudocode:
 181:   //   shift_amt = rs2 & 31;
 182:   //   multicycle_result = (rs1 >> shift_amt) | (rs1 << (32 - shift_amt));
 183:   //                       ^-- cycle 0 -----^ ^-- cycle 1 --------------^
 184:   //
 185:   // Funnel Shifts
 186:   // -------------
 187:   // For funnel shifts, operand_a_i is tied to rs1 in the first cycle and rs3 in the
 188:   // second cycle. operand_b_i is always tied to rs2. The order of applying the shift amount or
 189:   // its complement is determined by bit [5] of shift_amt.
 190:   //
 191:   // Funnel shift Pseudocode: (fsl)
 192:   //  shift_amt = rs2 & 63;
 193:   //  shift_amt_compl = 32 - shift_amt[4:0]
 194:   //  if (shift_amt >= 33):
 195:   //     multicycle_result = (rs1 >> shift_amt_compl[4:0]) | (rs3 << shift_amt[4:0]);
 196:   //                         ^-- cycle 0 ----------------^ ^-- cycle 1 ------------^
 197:   //  else if (shift_amt <= 31 && shift_amt > 0):
 198:   //     multicycle_result = (rs1 << shift_amt[4:0]) | (rs3 >> shift_amt_compl[4:0]);
 199:   //                         ^-- cycle 0 ----------^ ^-- cycle 1 -------------------^
 200:   //  For shift_amt == 0, 32, both shift_amt[4:0] and shift_amt_compl[4:0] == '0.
 201:   //  These cases need to be handled separately outside the shifting structure:
 202:   //  else if (shift_amt == 32):
 203:   //     multicycle_result = rs3
 204:   //  else if (shift_amt == 0):
 205:   //     multicycle_result = rs1.
 206:   //
 207:   // Single-Bit Instructions
 208:   // =======================
 209:   // Single-bit instructions operate on the bit at position operand_b_i[4:0] of operand_a_i.
 210: 
 211:   // The operations sbset, sbclr and sbinv are implemented by generation of a bit-mask using the
 212:   // shifter structure. This is done by left-shifting the operand 32'h1 by the required amount.
 213:   // The signal shift_sbmode multiplexes the shifter input and sets the signal shift_left.
 214:   // Further processing is taken care of by a separate structure.
 215:   //
 216:   // For sbext, the bit defined by operand_b_i[4:0] is to be returned. This is done by simply
 217:   // shifting operand_a_i to the right by the required amount and returning bit [0] of the result.
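        //
        // For illustration, the single-bit operations behave as:
        //   mask = 32'h1 << operand_b_i[4:0]
        //   sbset: rd = operand_a_i |  mask
        //   sbclr: rd = operand_a_i & ~mask
        //   sbinv: rd = operand_a_i ^  mask
        //   sbext: rd = (operand_a_i >> operand_b_i[4:0]) & 32'h1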
 218:   //
 219:   // Bit-Field Place
 220:   // ===============
 221:   // The shifter structure is shared to compute bfp_mask << bfp_off.
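        //
        // For illustration, bfp places the lower len bits of operand_b_i into operand_a_i at
        // offset off, with both fields decoded from operand_b_i as below:
        //   len  = operand_b_i[27:24]  (len == 0 encodes len == 16)
        //   off  = operand_b_i[20:16]
        //   mask = ~(32'hffff_ffff << len) << off
        //   rd   = (operand_a_i & ~mask) | ((operand_b_i << off) & mask)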
 222: 
 223:   logic       shift_left;
 224:   logic       shift_ones;
 225:   logic       shift_arith;
 226:   logic       shift_funnel;
 227:   logic       shift_sbmode;
 228:   logic [5:0] shift_amt;
 229:   logic [5:0] shift_amt_compl; // complementary shift amount (32 - shift_amt)
 230: 
 231:   logic [31:0] shift_result;
 232:   logic [32:0] shift_result_ext;
 233:   logic [31:0] shift_result_rev;
 234: 
 235:   // zbf
 236:   logic bfp_op;
 237:   logic [4:0]  bfp_len;
 238:   logic [4:0]  bfp_off;
 239:   logic [31:0] bfp_mask;
 240:   logic [31:0] bfp_mask_rev;
 241:   logic [31:0] bfp_result;
 242: 
 243:   // bfp: shares the shifter structure to compute bfp_mask << bfp_off
 244:   assign bfp_op = RV32B ? (operator_i == ALU_BFP) : 1'b0;
 245:   assign bfp_len = {~(|operand_b_i[27:24]), operand_b_i[27:24]}; // len = 0 encodes for len = 16
 246:   assign bfp_off = operand_b_i[20:16];
 247:   assign bfp_mask = RV32B ? ~(32'hffff_ffff << bfp_len) : '0;
 248:   for (genvar i=0; i<32; i++) begin : gen_rev_bfp_mask
 249:     assign bfp_mask_rev[i] = bfp_mask[31-i];
 250:   end
 251: 
 252:   assign bfp_result =
 253:       RV32B ? (~shift_result & operand_a_i) | ((operand_b_i & bfp_mask) << bfp_off) : '0;
 254: 
 255:   // bit shift_amt[5]: word swap bit: only considered for FSL/FSR.
 256:   // if set, reverse operations in first and second cycle.
 257:   assign shift_amt[5] = operand_b_i[5] & shift_funnel;
 258:   assign shift_amt_compl = 32 - operand_b_i[4:0];
 259: 
 260:   always_comb begin
 261:     if (bfp_op) begin
 262:       shift_amt[4:0] = bfp_off; // offset field of bfp control word
 263:     end else begin
 264:       shift_amt[4:0] = instr_first_cycle_i ?
 265:           (operand_b_i[5] && shift_funnel ? shift_amt_compl[4:0] : operand_b_i[4:0]) :
 266:           (operand_b_i[5] && shift_funnel ? operand_b_i[4:0] : shift_amt_compl[4:0]);
 267:     end
 268:   end
 269: 
 270: 
 271:   // single-bit mode: shift
 272:   assign shift_sbmode = RV32B ?
 273:       (operator_i == ALU_SBSET) | (operator_i == ALU_SBCLR) | (operator_i == ALU_SBINV) : 1'b0;
 274: 
 275:   // left shift if this is:
 276:   // * a standard left shift (slo, sll)
 277:   // * a rol in the first cycle
 278:   // * a ror in the second cycle
 279:   // * fsl: without word-swap bit: first cycle, else: second cycle
 280:   // * fsr: without word-swap bit: second cycle, else: first cycle
 281:   // * a single-bit instruction: sbclr, sbset, sbinv (excluding sbext)
 282:   // * bfp: bfp_mask << bfp_off
 283:   always_comb begin
 284:     unique case (operator_i)
 285:       ALU_SLL: shift_left = 1'b1;
 286:       ALU_SLO,
 287:       ALU_BFP: shift_left = RV32B ? 1'b1 : 1'b0;
 288:       ALU_ROL: shift_left = RV32B ? instr_first_cycle_i : 1'b0;
 289:       ALU_ROR: shift_left = RV32B ? ~instr_first_cycle_i : 1'b0;
 290:       ALU_FSL: shift_left =
 291:           RV32B ? (shift_amt[5] ? ~instr_first_cycle_i : instr_first_cycle_i) : 1'b0;
 292:       ALU_FSR: shift_left =
 293:           RV32B ? (shift_amt[5] ? instr_first_cycle_i : ~instr_first_cycle_i) : 1'b0;
 294:       default: shift_left = 1'b0;
 295:     endcase
 296:     if (shift_sbmode) begin
 297:       shift_left = 1'b1;
 298:     end
 299:   end
 300: 
 301:   assign shift_arith      = (operator_i == ALU_SRA);
 302:   assign shift_ones       = RV32B ? (operator_i == ALU_SLO) | (operator_i == ALU_SRO) : 1'b0;
 303:   assign shift_funnel     = RV32B ? (operator_i == ALU_FSL) | (operator_i == ALU_FSR) : 1'b0;
 304: 
 305:   // shifter structure.
 306:   always_comb begin
 307: 
 308:     // select shifter input
 309:     // for bfp, sbmode and shift_left the corresponding bit-reversed input is chosen.
 310:     if (shift_sbmode) begin
 311:       shift_result = 32'h8000_0000; // rev(32'h1)
 312:     end else begin
 313:       unique case (1'b1)
 314:         bfp_op:       shift_result = bfp_mask_rev;
 315:         shift_left:   shift_result = operand_a_rev;
 316:         default:      shift_result = operand_a_i;
 317:       endcase
 318:     end
 319: 
 320: 
 321:     shift_result_ext =
 322:         $signed({shift_ones | (shift_arith & shift_result[31]), shift_result}) >>> shift_amt[4:0];
 323: 
 324:     shift_result = shift_result_ext[31:0];
 325: 
 326:     for (int unsigned i=0; i<32; i++) begin
 327:       shift_result_rev[i] = shift_result[31-i];
 328:     end
 329: 
 330:     shift_result = shift_left ? shift_result_rev : shift_result;
 331: 
 332:   end
 333: 
 334:   ///////////////////
 335:   // Bitwise Logic //
 336:   ///////////////////
 337: 
 338:   logic bwlogic_or;
 339:   logic bwlogic_and;
 340:   logic [31:0] bwlogic_operand_b;
 341:   logic [31:0] bwlogic_or_result;
 342:   logic [31:0] bwlogic_and_result;
 343:   logic [31:0] bwlogic_xor_result;
 344:   logic [31:0] bwlogic_result;
 345: 
 346:   logic bwlogic_op_b_negate;
 347: 
 348:   always_comb begin
 349:     unique case (operator_i)
 350:       // Logic-with-negate OPs (RV32B Ops)
 351:       ALU_XNOR,
 352:       ALU_ORN,
 353:       ALU_ANDN: bwlogic_op_b_negate = RV32B ? 1'b1 : 1'b0;
 354:       ALU_CMIX: bwlogic_op_b_negate = RV32B ? ~instr_first_cycle_i : 1'b0;
 355:       default:  bwlogic_op_b_negate = 1'b0;
 356:     endcase
 357:   end
 358: 
 359:   assign bwlogic_operand_b = bwlogic_op_b_negate ? operand_b_neg[32:1] : operand_b_i;
 360: 
 361:   assign bwlogic_or_result  = operand_a_i | bwlogic_operand_b;
 362:   assign bwlogic_and_result = operand_a_i & bwlogic_operand_b;
 363:   assign bwlogic_xor_result = operand_a_i ^ bwlogic_operand_b;
 364: 
 365:   assign bwlogic_or  = (operator_i == ALU_OR)  | (operator_i == ALU_ORN);
 366:   assign bwlogic_and = (operator_i == ALU_AND) | (operator_i == ALU_ANDN);
 367: 
 368:   always_comb begin
 369:     unique case (1'b1)
 370:       bwlogic_or:  bwlogic_result = bwlogic_or_result;
 371:       bwlogic_and: bwlogic_result = bwlogic_and_result;
 372:       default:     bwlogic_result = bwlogic_xor_result;
 373:     endcase
 374:   end
 375: 
 376:   logic [31:0] shuffle_result;
 377:   logic [31:0] butterfly_result;
 378:   logic [31:0] invbutterfly_result;
 379: 
 380:   logic [31:0] minmax_result;
 381:   logic [5:0]  bitcnt_result;
 382:   logic [31:0] pack_result;
 383:   logic [31:0] sext_result;
 384:   logic [31:0] multicycle_result;
 385:   logic [31:0] singlebit_result;
 386:   logic [31:0] clmul_result;
 387: 
 388:   if (RV32B) begin : g_alu_rvb
 389: 
 390:     /////////////////
 391:     // Bitcounting //
 392:     /////////////////
 393: 
 394:     // The bit-counter structure computes the number of set bits in its operand. Partial results
 395:     // (from left to right) are needed to compute the control masks for computation of bext/bdep
 396:     // by the butterfly network, if implemented.
 397:     // For pcnt, clz and ctz, only the end result is used.
 398: 
 399:     logic        zbe_op;
 400:     logic        bitcnt_ctz;
 401:     logic        bitcnt_clz;
 402:     logic        bitcnt_cz;
 403:     logic [31:0] bitcnt_bits;
 404:     logic [31:0] bitcnt_mask_op;
 405:     logic [31:0] bitcnt_bit_mask;
 406:     logic [ 5:0] bitcnt_partial [32];
 407: 
 408: 
 409:     assign bitcnt_ctz    = operator_i == ALU_CTZ;
 410:     assign bitcnt_clz    = operator_i == ALU_CLZ;
 411:     assign bitcnt_cz     = bitcnt_ctz | bitcnt_clz;
 412:     assign bitcnt_result = bitcnt_partial[31];
 413: 
 414:     // Bit-mask generation for clz and ctz:
 415:     // The bit mask is generated by spreading the lowest-order set bit in the operand to all
 416:     // higher order bits. The resulting mask is inverted to cover the lowest order zeros. In order
 417:     // to create the bit mask for leading zeros, the input operand needs to be reversed.
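          // For illustration: with operand_a_i = 32'h0000_0028 (...0010_1000), spreading the lowest
          // set bit gives ...1111_1000; the inverted mask 32'h0000_0007 covers exactly the three
          // trailing zeros, so counting its set bits yields ctz = 3.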
 418:     assign bitcnt_mask_op = bitcnt_clz ? operand_a_rev : operand_a_i;
 419: 
 420:     always_comb begin
 421:       bitcnt_bit_mask = bitcnt_mask_op;
 422:       bitcnt_bit_mask |= bitcnt_bit_mask << 1;
 423:       bitcnt_bit_mask |= bitcnt_bit_mask << 2;
 424:       bitcnt_bit_mask |= bitcnt_bit_mask << 4;
 425:       bitcnt_bit_mask |= bitcnt_bit_mask << 8;
 426:       bitcnt_bit_mask |= bitcnt_bit_mask << 16;
 427:       bitcnt_bit_mask = ~bitcnt_bit_mask;
 428:     end
 429: 
 430:     always_comb begin
 431:       case(1'b1)
 432:         zbe_op:      bitcnt_bits = operand_b_i;
 433:         bitcnt_cz:   bitcnt_bits = bitcnt_bit_mask & ~bitcnt_mask_op; // clz / ctz
 434:         default:     bitcnt_bits = operand_a_i; // pcnt
 435:       endcase
 436:     end
 437: 
 438:     // The parallel prefix counter is of the structure of a Brent-Kung Adder. In the first
 439:     // log2(width) stages, the sum of the n preceding bit lines is computed for the bit lines at
 440:     // positions 2**n-1 (power-of-two positions) where n denotes the current stage.
 441:     // In stage n=log2(width), the count for position width-1 (the MSB) is finished.
 442:     // For the intermediate values, an inverse adder tree then computes the bit counts for the bit
 443:     // lines at positions
 444:     // m = 2**(n-1) + i*2**(n-2), where i = [1 ... width / 2**(n-1)-1] and n = [log2(width) ... 2].
 445:     // Thus, at every subsequent stage the result of two previously unconnected sub-trees is
 446:     // summed, starting at the node summing bits [width/2-1 : 0] and [3*width/4-1: width/2]
 447:     // and moving to iteratively sum up all the sub-trees.
 448:     // The inverse adder tree thus features log2(width) - 1 stages; the first of these stages is a
 449:     // single addition at position 3*width/4 - 1. It does not interfere with the last
 450:     // stage of the primary adder tree. These stages can thus be folded together, resulting in a
 451:     // total of 2*log2(width)-2 stages.
 452:     // For more details refer to R. Brent, H. T. Kung, "A Regular Layout for Parallel Adders",
 453:     // (1982).
 454:     // For a bitline at position p, only bits
 455:     // bitcnt_partial[max(i, such that p % log2(i) == 0)-1 : 0] are needed for generation of the
 456:     // butterfly network control signals. The adders in the intermediate value adder tree thus need
 457:     // not be full 5-bit adders. We leave the optimization to the synthesis tools.
 458:     //
 459:     // Consider the following 8-bit example for illustration.
 460:     //
 461:     // let bitcnt_bits = 8'babcdefgh.
 462:     //
 463:     //                   a  b  c  d  e  f  g  h
 464:     //                   | /:  | /:  | /:  | /:
 465:     //                   |/ :  |/ :  |/ :  |/ :
 466:     // stage 1:          +  :  +  :  +  :  +  :
 467:     //                   |  : /:  :  |  : /:  :
 468:     //                   |,--+ :  :  |,--+ :  :
 469:     // stage 2:          +  :  :  :  +  :  :  :
 470:     //                   |  :  |  : /:  :  :  :
 471:     //                   |,-----,--+ :  :  :  : ^-primary adder tree
 472:     // stage 3:          +  :  +  :  :  :  :  : -------------------------
 473:     //                   :  | /| /| /| /| /|  : ,-intermediate adder tree
 474:     //                   :  |/ |/ |/ |/ |/ :  :
 475:     // stage 4           :  +  +  +  +  +  :  :
 476:     //                   :  :  :  :  :  :  :  :
 477:     // bitcnt_partial[i] 7  6  5  4  3  2  1  0
 478: 
 479:     always_comb begin
 480:       bitcnt_partial = '{default: '0};
 481:       // stage 1
 482:       for (int unsigned i=1; i<32; i+=2) begin
 483:         bitcnt_partial[i] = {5'h0, bitcnt_bits[i]} + {5'h0, bitcnt_bits[i-1]};
 484:       end
 485:       // stage 2
 486:       for (int unsigned i=3; i<32; i+=4) begin
 487:         bitcnt_partial[i] = bitcnt_partial[i-2] + bitcnt_partial[i];
 488:       end
 489:       // stage 3
 490:       for (int unsigned i=7; i<32; i+=8) begin
 491:         bitcnt_partial[i] = bitcnt_partial[i-4] + bitcnt_partial[i];
 492:       end
 493:       // stage 4
 494:       for (int unsigned i=15; i <32; i+=16) begin
 495:         bitcnt_partial[i] = bitcnt_partial[i-8] + bitcnt_partial[i];
 496:       end
 497:       // stage 5
 498:       bitcnt_partial[31] = bitcnt_partial[15] + bitcnt_partial[31];
 499:       // ^- primary adder tree
 500:       // -------------------------------
 501:       // ,-intermediate value adder tree
 502:       bitcnt_partial[23] = bitcnt_partial[15] + bitcnt_partial[23];
 503: 
 504:       // stage 6
 505:       for (int unsigned i=11; i<32; i+=8) begin
 506:         bitcnt_partial[i] = bitcnt_partial[i-4] + bitcnt_partial[i];
 507:       end
 508: 
 509:       // stage 7
 510:       for (int unsigned i=5; i<32; i+=4) begin
 511:         bitcnt_partial[i] = bitcnt_partial[i-2] + bitcnt_partial[i];
 512:       end
 513:       // stage 8
 514:       bitcnt_partial[0] = {5'h0, bitcnt_bits[0]};
 515:       for (int unsigned i=2; i<32; i+=2) begin
 516:         bitcnt_partial[i] = bitcnt_partial[i-1] + {5'h0, bitcnt_bits[i]};
 517:       end
 518:     end
 519: 
 520:     ///////////////
 521:     // Butterfly //
 522:     ///////////////
 523: 
 524:     // The butterfly / inverse butterfly network is shared between the bext/bdep (zbe)
 525:     // instructions and the grev / gorc instructions (zbp).
 526:     // For bdep, the control bit mask of a local left region is generated by
 527:     // the inverse of an n-bit left rotate and complement upon wrap (LROTC) operation by the number
 528:     // of ones in the deposit bitmask to the right of the segment. Here, n denotes the width
 529:     // of the corresponding segment. The bitmask for the pertaining local right region is equal
 530:     // to that of the corresponding local left region. Bext uses an analogous inverse process.
 531:     // Consider the following 8-bit example.  For details, see Hilewitz et al. "Fast Bit Gather,
 532:     // Bit Scatter and Bit Permutation Instructions for Commodity Microprocessors", (2008).
 533: 
 534:     // 8-bit example:  (Hilewitz et al.)
 535:     // Consider the instruction bdep operand_a_i deposit_mask
 536:     // Let operand_a_i = 8'babcd_efgh
 537:     //    deposit_mask = 8'b1010_1101
 538:     //
 539:     // control bitmask for stage 1:
 540:     //  - number of ones in the right half of the deposit bitmask: 3
 541:     //  - width of the segment: 4
 542:     //  - control bitmask = ~LROTC(4'b0, 3)[3:0] = 4'b1000
 543:     //
 544:     // control bitmask:   c3 c2  c1 c0  c3 c2  c1 c0
 545:     //                    1  0   0  0   1  0   0  0
 546:     //                    <- L ----->   <- R ----->
 547:     // operand_a_i        a  b   c  d   e  f   g  h
 548:     //                    :\ |   |  |  /:  |   |  |
 549:     //                    : +|---|--|-+ :  |   |  |
 550:     //                    :/ |   |  |  \:  |   |  |
 551:     // stage 1            e  b   c  d   a  f   g  h
 552:     //                             
 553:     // control bitmask:   c3 c2  c3 c2  c1 c0  c1 c0
 554:     //                    1  1   1  1   1  0   1  0
 555:     //                    :\ :\ /: /:   :\ |  /:  |
 556:     //                    : +:-+-:+ :   : +|-+ :  |
 557:     //                    :/ :/ \: \:   :/ |  \:  |
 558:     // stage 2            c  d   e  b   g  f   a  h
 559:     //                    L  R   L  R   L  R   L  R
 560:     // control bitmask:   c3 c3  c2 c2  c1 c1  c0 c0
 561:     //                    1  1   0  0   1  1   0  0
 562:     //                    :\/:   |  |   :\/:   |  |
 563:     //                    :  :   |  |   :  :   |  |
 564:     //                    :/\:   |  |   :/\:   |  |
 565:     // stage 3            d  c   e  b   f  g   a  h
 566:     // & deposit bitmask: 1  0   1  0   1  1   0  1
 567:     // result:            d  0   e  0   f  g   0  h
 568: 
 569:     assign zbe_op = (operator_i == ALU_BEXT) | (operator_i == ALU_BDEP);
 570: 
 571:     logic [31:0] butterfly_mask_l[5];
 572:     logic [31:0] butterfly_mask_r[5];
 573:     logic [31:0] butterfly_mask_not[5];
 574:     logic [31:0] lrotc_stage [5]; // left rotate and complement upon wrap
 575: 
 576:     // bext / bdep
 577:     logic [31:0] butterfly_zbe_mask_l[5];
 578:     logic [31:0] butterfly_zbe_mask_r[5];
 579:     logic [31:0] butterfly_zbe_mask_not[5];
 580: 
 581:     // grev / gorc
 582:     logic [31:0] butterfly_zbp_mask_l[5];
 583:     logic [31:0] butterfly_zbp_mask_r[5];
 584:     logic [31:0] butterfly_zbp_mask_not[5];
 585: 
 586:     logic grev_op;
 587:     logic gorc_op;
 588:     logic zbp_op;
 589: 
 590:     // number of bits in local r = 32 / 2**(stage + 1) = 16/2**stage
 591:     `define _N(stg) (16 >> stg)
 592: 
 593:     // bext / bdep control bit generation
 594:     for (genvar stg=0; stg<5; stg++) begin : gen_butterfly_ctrl_stage
 595:       // number of segs: 2** stg
 596:       for (genvar seg=0; seg<2**stg; seg++) begin : gen_butterfly_ctrl_seg
 597: 
 598:         assign lrotc_stage[stg][2*`_N(stg)*(seg+1)-1 : 2*`_N(stg)*seg] =
 599:             {{`_N(stg){1'b0}},{`_N(stg){1'b1}}} <<
 600:                 bitcnt_partial[`_N(stg)*(2*seg+1)-1][$clog2(`_N(stg)):0];
 601: 
 602:         assign butterfly_zbe_mask_l[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)]
 603:                      = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)];
 604: 
 605:         assign butterfly_zbe_mask_r[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)]
 606:                      = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)];
 607: 
 608:         assign butterfly_zbe_mask_l[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)]   = '0;
 609:         assign butterfly_zbe_mask_r[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)] = '0;
 610:       end
 611:     end
 612:     `undef _N
 613: 
 614:     for (genvar stg=0; stg<5; stg++) begin : gen_butterfly_mask_not
 615:       assign butterfly_zbe_mask_not[stg] =
 616:           ~(butterfly_zbe_mask_l[stg] | butterfly_zbe_mask_r[stg]);
 617:     end
 618: 
 619:     // grev / gorc control bit generation
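          // For illustration: an active shift_amt bit at stage k exchanges the left and right
          // halves of every (32 >> k)-bit block (grev); for gorc, butterfly_mask_not stays all
          // ones, so the original bits are kept and the shifted copies are OR-ed in instead.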
 620:     assign butterfly_zbp_mask_l[0] = shift_amt[4] ? 32'hffff_0000 : 32'h0000_0000;
 621:     assign butterfly_zbp_mask_r[0] = shift_amt[4] ? 32'h0000_ffff : 32'h0000_0000;
 622:     assign butterfly_zbp_mask_not[0] =
 623:        !shift_amt[4] || (shift_amt[4] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000;
 624: 
 625:     assign butterfly_zbp_mask_l[1] = shift_amt[3] ? 32'hff00_ff00 : 32'h0000_0000;
 626:     assign butterfly_zbp_mask_r[1] = shift_amt[3] ? 32'h00ff_00ff : 32'h0000_0000;
 627:     assign butterfly_zbp_mask_not[1] =
 628:        !shift_amt[3] || (shift_amt[3] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000;
 629: 
 630:     assign butterfly_zbp_mask_l[2] = shift_amt[2] ? 32'hf0f0_f0f0 : 32'h0000_0000;
 631:     assign butterfly_zbp_mask_r[2] = shift_amt[2] ? 32'h0f0f_0f0f : 32'h0000_0000;
 632:     assign butterfly_zbp_mask_not[2] =
 633:        !shift_amt[2] || (shift_amt[2] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000;
 634: 
 635:     assign butterfly_zbp_mask_l[3] = shift_amt[1] ? 32'hcccc_cccc : 32'h0000_0000;
 636:     assign butterfly_zbp_mask_r[3] = shift_amt[1] ? 32'h3333_3333 : 32'h0000_0000;
 637:     assign butterfly_zbp_mask_not[3] =
 638:        !shift_amt[1] || (shift_amt[1] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000;
 639: 
 640:     assign butterfly_zbp_mask_l[4] = shift_amt[0] ? 32'haaaa_aaaa : 32'h0000_0000;
 641:     assign butterfly_zbp_mask_r[4] = shift_amt[0] ? 32'h5555_5555 : 32'h0000_0000;
 642:     assign butterfly_zbp_mask_not[4] =
 643:        !shift_amt[0] || (shift_amt[0] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000;
 644: 
 645:     // grev / gorc instructions
 646:     assign grev_op = RV32B ? (operator_i == ALU_GREV) : 1'b0;
 647:     assign gorc_op = RV32B ? (operator_i == ALU_GORC) : 1'b0;
 648:     assign zbp_op = grev_op | gorc_op;
 649: 
 650:     // select set of masks:
 651:     assign butterfly_mask_l   = zbp_op ? butterfly_zbp_mask_l   : butterfly_zbe_mask_l;
 652:     assign butterfly_mask_r   = zbp_op ? butterfly_zbp_mask_r   : butterfly_zbe_mask_r;
 653:     assign butterfly_mask_not = zbp_op ? butterfly_zbp_mask_not : butterfly_zbe_mask_not;
 654: 
 655:     always_comb begin
 656:       butterfly_result = operand_a_i;
 657: 
 658:       butterfly_result = butterfly_result & butterfly_mask_not[0] |
 659:           ((butterfly_result & butterfly_mask_l[0]) >> 16)|
 660:           ((butterfly_result & butterfly_mask_r[0]) << 16);
 661: 
 662:       butterfly_result = butterfly_result & butterfly_mask_not[1] |
 663:           ((butterfly_result & butterfly_mask_l[1]) >> 8)|
 664:           ((butterfly_result & butterfly_mask_r[1]) << 8);
 665: 
 666:       butterfly_result = butterfly_result & butterfly_mask_not[2] |
 667:           ((butterfly_result & butterfly_mask_l[2]) >> 4)|
 668:           ((butterfly_result & butterfly_mask_r[2]) << 4);
 669: 
 670:       butterfly_result = butterfly_result & butterfly_mask_not[3] |
 671:           ((butterfly_result & butterfly_mask_l[3]) >> 2)|
 672:           ((butterfly_result & butterfly_mask_r[3]) << 2);
 673: 
 674:       butterfly_result = butterfly_result & butterfly_mask_not[4] |
 675:           ((butterfly_result & butterfly_mask_l[4]) >> 1)|
 676:           ((butterfly_result & butterfly_mask_r[4]) << 1);
 677: 
 678:       if (!zbp_op) begin
 679:         butterfly_result = butterfly_result & operand_b_i;
 680:       end
 681:     end
 682: 
 683:     always_comb begin
 684:       invbutterfly_result = operand_a_i & operand_b_i;
 685: 
 686:       invbutterfly_result = invbutterfly_result & butterfly_mask_not[4] |
 687:           ((invbutterfly_result & butterfly_mask_l[4]) >> 1)|
 688:           ((invbutterfly_result & butterfly_mask_r[4]) << 1);
 689: 
 690:       invbutterfly_result = invbutterfly_result & butterfly_mask_not[3] |
 691:           ((invbutterfly_result & butterfly_mask_l[3]) >> 2)|
 692:           ((invbutterfly_result & butterfly_mask_r[3]) << 2);
 693: 
 694:       invbutterfly_result = invbutterfly_result & butterfly_mask_not[2] |
 695:           ((invbutterfly_result & butterfly_mask_l[2]) >> 4)|
 696:           ((invbutterfly_result & butterfly_mask_r[2]) << 4);
 697: 
 698:       invbutterfly_result = invbutterfly_result & butterfly_mask_not[1] |
 699:           ((invbutterfly_result & butterfly_mask_l[1]) >> 8)|
 700:           ((invbutterfly_result & butterfly_mask_r[1]) << 8);
 701: 
 702:       invbutterfly_result = invbutterfly_result & butterfly_mask_not[0] |
 703:           ((invbutterfly_result & butterfly_mask_l[0]) >> 16)|
 704:           ((invbutterfly_result & butterfly_mask_r[0]) << 16);
 705:     end
 706: 
 707:     /////////////////////////
 708:     // Shuffle / Unshuffle //
 709:     /////////////////////////
 710: 
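          // For illustration: shfl/unshfl implement generalised zip/unzip permutations of
          // operand_a_i; each bit of the (possibly flipped) shuffle_mode enables one swap stage
          // built from the SHUFFLE_MASK_* constants below.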
 711:     localparam logic [31:0] SHUFFLE_MASK_L [0:3] =
 712:         '{32'h00ff_0000, 32'h0f00_0f00, 32'h3030_3030, 32'h4444_4444};
 713:     localparam logic [31:0] SHUFFLE_MASK_R [0:3] =
 714:         '{32'h0000_ff00, 32'h00f0_00f0, 32'h0c0c_0c0c, 32'h2222_2222};
 715: 
 716:     localparam logic [31:0] FLIP_MASK_L [0:3] =
 717:         '{32'h2200_1100, 32'h0044_0000, 32'h4411_0000, 32'h1100_0000};
 718:     localparam logic [31:0] FLIP_MASK_R [0:3] =
 719:         '{32'h0088_0044, 32'h0000_2200, 32'h0000_8822, 32'h0000_0088};
 720: 
 721:     logic [31:0] SHUFFLE_MASK_NOT [0:3];
 722:     for (genvar i = 0; i < 4; i++) begin : gen_shuffle_mask_not
 723:       assign SHUFFLE_MASK_NOT[i] = ~(SHUFFLE_MASK_L[i] | SHUFFLE_MASK_R[i]);
 724:     end
 725: 
 726:     logic shuffle_flip;
 727:     assign shuffle_flip = operator_i == ALU_UNSHFL;
 728: 
 729:     logic [3:0] shuffle_mode;
 730: 
 731:     always_comb begin
 732:       shuffle_result = operand_a_i;
 733: 
 734:       if (shuffle_flip) begin
 735:         shuffle_mode[3] = shift_amt[0];
 736:         shuffle_mode[2] = shift_amt[1];
 737:         shuffle_mode[1] = shift_amt[2];
 738:         shuffle_mode[0] = shift_amt[3];
 739:       end else begin
 740:         shuffle_mode = shift_amt[3:0];
 741:       end
 742: 
 743:       if (shuffle_flip) begin
 744:         shuffle_result = (shuffle_result & 32'h8822_4411) |
 745:             ((shuffle_result << 6)  & FLIP_MASK_L[0]) | ((shuffle_result >> 6)  & FLIP_MASK_R[0]) |
 746:             ((shuffle_result << 9)  & FLIP_MASK_L[1]) | ((shuffle_result >> 9)  & FLIP_MASK_R[1]) |
 747:             ((shuffle_result << 15) & FLIP_MASK_L[2]) | ((shuffle_result >> 15) & FLIP_MASK_R[2]) |
 748:             ((shuffle_result << 21) & FLIP_MASK_L[3]) | ((shuffle_result >> 21) & FLIP_MASK_R[3]);
 749:       end
 750: 
 751:       if (shuffle_mode[3]) begin
 752:         shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[0]) |
 753:             (((shuffle_result << 8) & SHUFFLE_MASK_L[0]) |
 754:             ((shuffle_result >> 8) & SHUFFLE_MASK_R[0]));
 755:       end
 756:       if (shuffle_mode[2]) begin
 757:         shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[1]) |
 758:             (((shuffle_result << 4) & SHUFFLE_MASK_L[1]) |
 759:             ((shuffle_result >> 4) & SHUFFLE_MASK_R[1]));
 760:       end
 761:       if (shuffle_mode[1]) begin
 762:         shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[2]) |
 763:             (((shuffle_result << 2) & SHUFFLE_MASK_L[2]) |
 764:             ((shuffle_result >> 2) & SHUFFLE_MASK_R[2]));
 765:       end
 766:       if (shuffle_mode[0]) begin
 767:         shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[3]) |
 768:             (((shuffle_result << 1) & SHUFFLE_MASK_L[3]) |
 769:             ((shuffle_result >> 1) & SHUFFLE_MASK_R[3]));
 770:       end
 771: 
 772:       if (shuffle_flip) begin
 773:         shuffle_result = (shuffle_result & 32'h8822_4411) |
 774:             ((shuffle_result << 6)  & FLIP_MASK_L[0]) | ((shuffle_result >> 6) & FLIP_MASK_R[0])  |
 775:             ((shuffle_result << 9)  & FLIP_MASK_L[1]) | ((shuffle_result >> 9) & FLIP_MASK_R[1])  |
 776:             ((shuffle_result << 15) & FLIP_MASK_L[2]) | ((shuffle_result >> 15) & FLIP_MASK_R[2]) |
 777:             ((shuffle_result << 21) & FLIP_MASK_L[3]) | ((shuffle_result >> 21) & FLIP_MASK_R[3]);
 778:       end
 779: 
 780:     end
 781:     ///////////////////////////////////////////////////
 782:     // Carry-less Multiply + Cyclic Redundancy Check //
 783:     ///////////////////////////////////////////////////
 784: 
 785:     // Carry-less multiplication can be understood as long multiplication in which
 786:     // the additions are replaced by bit-wise xor operations.
 787:     //
 788:     // Example: 1101 X 1011 = 1111111:
 789:     //
 790:     //       1011 X 1101
 791:     //       -----------
 792:     //              1101
 793:     //         xor 1101
 794:     //         ---------
 795:     //             10111
 796:     //        xor 0000
 797:     //        ----------
 798:     //            010111
 799:     //       xor 1101
 800:     //       -----------
 801:     //           1111111
 802:     //
 803:     // Architectural details:
 804:     //         A 32 x 32-bit array
 805:     //         [ operand_b[i] ? (operand_a << i) : '0 for i in 0 ... 31 ]
 806:     //         is generated. The entries of the array are pairwise 'xor-ed'
 807:     //         together in a 5-stage binary tree.
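          //
          // For illustration, the array-and-tree structure computes:
          //   result = 0
          //   for i in 0 ... 31:
          //     if operand_b[i]: result = result ^ (operand_a << i)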
 808:     //
 809:     //
 810:     // Cyclic Redundancy Check:
 811:     //
 812:     // CRC-32 (CRC-32/ISO-HDLC) and CRC-32C (CRC-32/ISCSI) are directly implemented. For
 813:     // documentation of the crc configuration (crc-polynomials, initialization, reflection, etc.)
 814:     // see http://reveng.sourceforge.net/crc-catalogue/all.htm
 815:     // A useful guide to crc arithmetic and algorithms is given here:
 816:     // http://www.piclist.com/techref/method/math/crcguide.html.
 817:     //
 818:     // The CRC operation solves the following equation using binary polynomial arithmetic:
 819:     //
 820:     // rev(rd)(x) = rev(rs1)(x) * x**n mod {1, P}(x)
 821:     //
 822:     // where P denotes lower 32 bits of the corresponding CRC polynomial, rev(a) the bit reversal
 823:     // of a, and n = 8, 16, or 32 for the .b, .h, .w variants. {a, b} denotes bit concatenation.
 824:     //
 825:     // Using Barrett reduction, one can show that
 826:     //
 827:     // M(x) mod P(x) = R(x) =
 828:     //          ((M(x) * x**n) & {deg(P(x)){1'b1}}) ^ ((M(x) * x**-(deg(P(x)) - n)) cx mu(x) cx P(x)),
 829:     //
 830:     // where mu(x) = polydiv(x**64, {1,P}) & 0xffffffff. Here, 'cx' refers to carry-less
 831:     // multiplication. Substituting rev(rd)(x) for R(x) and rev(rs1)(x) for M(x) and solving for
 832:     // rd(x) with P(x) a crc32 polynomial (deg(P(x)) = 32), we get
 833:     //
 834:     // rd = rev( (rev(rs1) << n)  ^ ((rev(rs1) >> (32-n)) cx mu cx P) )
 835:     //    = (rs1 >> n) ^ rev(rev( (rs1 << (32-n)) cx rev(mu)) cx P)
 836:     //                       ^-- cycle 0--------------------^
 837:     //      ^- cycle 1 -------------------------------------------^
 838:     //
 839:     // In the last step we used the fact that carry-less multiplication is bit-order agnostic:
 840:     // rev(a cx b) = rev(a) cx rev(b).
 841: 
 842:     logic clmul_rmode;
 843:     logic clmul_hmode;
 844:     logic [31:0] clmul_op_a;
 845:     logic [31:0] clmul_op_b;
 846:     logic [31:0] operand_b_rev;
 847:     logic [31:0] clmul_and_stage[32];
 848:     logic [31:0] clmul_xor_stage1[16];
 849:     logic [31:0] clmul_xor_stage2[8];
 850:     logic [31:0] clmul_xor_stage3[4];
 851:     logic [31:0] clmul_xor_stage4[2];
 852: 
 853:     logic [31:0] clmul_result_raw;
 854:     logic [31:0] clmul_result_rev;
 855: 
 856:     for (genvar i=0; i<32; i++) begin: gen_rev_operand_b
 857:       assign operand_b_rev[i] = operand_b_i[31-i];
 858:     end
 859: 
 860:     assign clmul_rmode = operator_i == ALU_CLMULR;
 861:     assign clmul_hmode = operator_i == ALU_CLMULH;
 862: 
 863:     // CRC
 864:     localparam logic [31:0] CRC32_POLYNOMIAL = 32'h04c1_1db7;
 865:     localparam logic [31:0] CRC32_MU_REV = 32'hf701_1641;
 866: 
 867:     localparam logic [31:0] CRC32C_POLYNOMIAL = 32'h1edc_6f41;
 868:     localparam logic [31:0] CRC32C_MU_REV = 32'hdea7_13f1;
 869: 
 870:     logic crc_op;
 871:     logic crc_hmode;
 872:     logic crc_bmode;
 873: 
 874:     logic crc_cpoly;
 875: 
 876:     logic [31:0] crc_operand;
 877:     logic [31:0] crc_poly;
 878:     logic [31:0] crc_mu_rev;
 879: 
 880:     assign crc_op = (operator_i == ALU_CRC32C_W) | (operator_i == ALU_CRC32_W) |
 881:                     (operator_i == ALU_CRC32C_H) | (operator_i == ALU_CRC32_H) |
 882:                     (operator_i == ALU_CRC32C_B) | (operator_i == ALU_CRC32_B);
 883: 
 884:     assign crc_cpoly = (operator_i == ALU_CRC32C_W) |
 885:                        (operator_i == ALU_CRC32C_H) |
 886:                        (operator_i == ALU_CRC32C_B);
 887: 
 888:     assign crc_hmode = (operator_i == ALU_CRC32_H) | (operator_i == ALU_CRC32C_H);
 889:     assign crc_bmode = (operator_i == ALU_CRC32_B) | (operator_i == ALU_CRC32C_B);
 890: 
 891:     assign crc_poly   = crc_cpoly ? CRC32C_POLYNOMIAL : CRC32_POLYNOMIAL;
 892:     assign crc_mu_rev = crc_cpoly ? CRC32C_MU_REV : CRC32_MU_REV;
 893: 
 894:     always_comb begin
 895:       unique case(1'b1)
 896:         crc_bmode: crc_operand = {operand_a_i[7:0], 24'h0};
 897:         crc_hmode: crc_operand = {operand_a_i[15:0], 16'h0};
 898:         default:   crc_operand = operand_a_i;
 899:       endcase
 900:     end
 901: 
 902:     // Select clmul input
 903:     always_comb begin
 904:       if (crc_op) begin
 905:         clmul_op_a = instr_first_cycle_i ? crc_operand : imd_val_q_i;
 906:         clmul_op_b = instr_first_cycle_i ? crc_mu_rev : crc_poly;
 907:       end else begin
 908:         clmul_op_a = clmul_rmode | clmul_hmode ? operand_a_rev : operand_a_i;
 909:         clmul_op_b = clmul_rmode | clmul_hmode ? operand_b_rev : operand_b_i;
 910:       end
 911:     end
 912: 
 913:     for (genvar i=0; i<32; i++) begin : gen_clmul_and_op
 914:       assign clmul_and_stage[i] = clmul_op_b[i] ? clmul_op_a << i : '0;
 915:     end
 916: 
 917:     for (genvar i=0; i<16; i++) begin : gen_clmul_xor_op_l1
 918:       assign clmul_xor_stage1[i] = clmul_and_stage[2*i] ^ clmul_and_stage[2*i+1];
 919:     end
 920: 
 921:     for (genvar i=0; i<8; i++) begin : gen_clmul_xor_op_l2
 922:       assign clmul_xor_stage2[i] = clmul_xor_stage1[2*i] ^ clmul_xor_stage1[2*i+1];
 923:     end
 924: 
 925:     for (genvar i=0; i<4; i++) begin : gen_clmul_xor_op_l3
 926:       assign clmul_xor_stage3[i] = clmul_xor_stage2[2*i] ^ clmul_xor_stage2[2*i+1];
 927:     end
 928: 
 929:     for (genvar i=0; i<2; i++) begin : gen_clmul_xor_op_l4
 930:       assign clmul_xor_stage4[i] = clmul_xor_stage3[2*i] ^ clmul_xor_stage3[2*i+1];
 931:     end
 932: 
 933:     assign clmul_result_raw = clmul_xor_stage4[0] ^ clmul_xor_stage4[1];
 934: 
 935:     for (genvar i=0; i<32; i++) begin : gen_rev_clmul_result
 936:       assign clmul_result_rev[i] = clmul_result_raw[31-i];
 937:     end
 938: 
 939:     // clmulr_result = rev(clmul(rev(a), rev(b)))
 940:     // clmulh_result = clmulr_result >> 1
 941:     always_comb begin
 942:       case(1'b1)
 943:         clmul_rmode: clmul_result = clmul_result_rev;
 944:         clmul_hmode: clmul_result = {1'b0, clmul_result_rev[31:1]};
 945:         default:     clmul_result = clmul_result_raw;
 946:       endcase
 947:     end
 948: 
 949:     //////////////////////////////////////
 950:     // Multicycle Bitmanip Instructions //
 951:     //////////////////////////////////////
 952:     // Ternary instructions + Shift Rotations + CRC
 953:     // For ternary instructions (zbt), operand_a_i is tied to rs1 in the first cycle and rs3 in the
 954:     // second cycle. operand_b_i is always tied to rs2.
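          //
          // For illustration, the two-cycle decomposition yields:
          //   cmix: rd = (rs1 & rs2) | (rs3 & ~rs2)  (rs1 & rs2 is stored in the first cycle)
          //   cmov: rd = (rs2 != 32'h0) ? rs1 : rs3  (rs1 is stored in the first cycle)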
 955: 
 956: 
 957:     always_comb begin
 958:       unique case (operator_i)
 959:         ALU_CMOV: begin
 960:           imd_val_d_o = operand_a_i;
 961:           multicycle_result = (operand_b_i == 32'h0) ? operand_a_i : imd_val_q_i;
 962:           if (instr_first_cycle_i) begin
 963:             imd_val_we_o = 1'b1;
 964:           end else begin
 965:             imd_val_we_o = 1'b0;
 966:           end
 967:         end
 968: 
 969:         ALU_CMIX: begin
 970:           multicycle_result = imd_val_q_i | bwlogic_and_result;
 971:           imd_val_d_o = bwlogic_and_result;
 972:           if (instr_first_cycle_i) begin
 973:             imd_val_we_o = 1'b1;
 974:           end else begin
 975:             imd_val_we_o = 1'b0;
 976:           end
 977:         end
 978: 
 979:         ALU_FSR, ALU_FSL,
 980:         ALU_ROL, ALU_ROR: begin
 981:           if (shift_amt[4:0] == 5'h0) begin
 982:             multicycle_result = shift_amt[5] ? operand_a_i : imd_val_q_i;
 983:           end else begin
 984:             multicycle_result = imd_val_q_i | shift_result;
 985:           end
 986:           imd_val_d_o = shift_result;
 987:           if (instr_first_cycle_i) begin
 988:             imd_val_we_o = 1'b1;
 989:           end else begin
 990:             imd_val_we_o = 1'b0;
 991:           end
 992:         end
 993: 
 994:         ALU_CRC32_W, ALU_CRC32C_W,
 995:         ALU_CRC32_H, ALU_CRC32C_H,
 996:         ALU_CRC32_B, ALU_CRC32C_B: begin
 997:           imd_val_d_o = clmul_result_rev;
 998:           unique case(1'b1)
 999:             crc_bmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 8);
1000:             crc_hmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 16);
1001:             default:   multicycle_result = clmul_result_rev;
1002:           endcase
1003:           if (instr_first_cycle_i) begin
1004:             imd_val_we_o = 1'b1;
1005:           end else begin
1006:             imd_val_we_o = 1'b0;
1007:           end
1008:         end
1009: 
1010:         default: begin
1011:           imd_val_d_o = operand_a_i;
1012:           imd_val_we_o = 1'b0;
1013:           multicycle_result = operand_a_i;
1014:         end
1015:       endcase
1016:     end
1017: 
1018:     /////////////////////////////
1019:     // Single-bit Instructions //
1020:     /////////////////////////////
1021: 
1022:     always_comb begin
1023:       unique case (operator_i)
1024:         ALU_SBSET: singlebit_result = operand_a_i | shift_result;
1025:         ALU_SBCLR: singlebit_result = operand_a_i & ~shift_result;
1026:         ALU_SBINV: singlebit_result = operand_a_i ^ shift_result;
1027:         default:   singlebit_result = {31'h0, shift_result[0]}; // ALU_SBEXT
1028:       endcase
1029:     end
1030: 
1031:     ///////////////
1032:     // Min / Max //
1033:     ///////////////
1034: 
1035:     assign minmax_result = cmp_result ? operand_a_i : operand_b_i;
1036: 
1037: 
1038:     //////////
1039:     // Pack //
1040:     //////////
1041: 
1042:     logic packu;
1043:     logic packh;
1044:     assign packu = operator_i == ALU_PACKU;
1045:     assign packh = operator_i == ALU_PACKH;
1046: 
1047:     always_comb begin
1048:       unique case (1'b1)
1049:         packu:   pack_result = {operand_b_i[31:16], operand_a_i[31:16]};
1050:         packh:   pack_result = {16'h0, operand_b_i[7:0], operand_a_i[7:0]};
1051:         default: pack_result = {operand_b_i[15:0], operand_a_i[15:0]};
1052:       endcase
1053:     end
1054: 
1055:     //////////
1056:     // Sext //
1057:     //////////
1058: 
1059:     assign sext_result = (operator_i == ALU_SEXTB) ?
1060:         { {24{operand_a_i[7]}}, operand_a_i[7:0]} : { {16{operand_a_i[15]}}, operand_a_i[15:0]};
1061: 
1062:   end else begin : g_no_alu_rvb
1063:     // RV32B result signals
1064:     assign minmax_result       = '0;
1065:     assign bitcnt_result       = '0;
1066:     assign pack_result         = '0;
1067:     assign sext_result         = '0;
1068:     assign multicycle_result   = '0;
1069:     assign singlebit_result    = '0;
1070:     assign shuffle_result      = '0;
1071:     assign butterfly_result    = '0;
1072:     assign invbutterfly_result = '0;
1073:     assign clmul_result        = '0;
1074:     // RV32B support signals
1075:     assign imd_val_d_o         = '0;
1076:     assign imd_val_we_o        = '0;
1077:   end
1078: 
1079:   ////////////////
1080:   // Result mux //
1081:   ////////////////
1082: 
1083:   always_comb begin
1084:     result_o   = '0;
1085: 
1086:     unique case (operator_i)
1087:       // Bitwise Logic Operations (negate: RV32B)
1088:       ALU_XOR,  ALU_XNOR,
1089:       ALU_OR,   ALU_ORN,
1090:       ALU_AND,  ALU_ANDN: result_o = bwlogic_result;
1091: 
1092:       // Adder Operations
1093:       ALU_ADD,  ALU_SUB: result_o = adder_result;
1094: 
1095:       // Shift Operations
1096:       ALU_SLL,  ALU_SRL,
1097:       ALU_SRA,
1098:       // RV32B
1099:       ALU_SLO,  ALU_SRO: result_o = shift_result;
1100: 
1101:       // Shuffle Operations (RV32B)
1102:       ALU_SHFL, ALU_UNSHFL: result_o = shuffle_result;
1103: 
1104:       // Comparison Operations
1105:       ALU_EQ,   ALU_NE,
1106:       ALU_GE,   ALU_GEU,
1107:       ALU_LT,   ALU_LTU,
1108:       ALU_SLT,  ALU_SLTU: result_o = {31'h0,cmp_result};
1109: 
1110:       // MinMax Operations (RV32B)
1111:       ALU_MIN,  ALU_MAX,
1112:       ALU_MINU, ALU_MAXU: result_o = minmax_result;
1113: 
1114:       // Bitcount Operations (RV32B)
1115:       ALU_CLZ, ALU_CTZ,
1116:       ALU_PCNT: result_o = {26'h0, bitcnt_result};
1117: 
1118:       // Pack Operations (RV32B)
1119:       ALU_PACK, ALU_PACKH,
1120:       ALU_PACKU: result_o = pack_result;
1121: 
1122:       // Sign-Extend (RV32B)
1123:       ALU_SEXTB, ALU_SEXTH: result_o = sext_result;
1124: 
1125:       // Ternary Bitmanip Operations (RV32B)
1126:       ALU_CMIX, ALU_CMOV,
1127:       ALU_FSL,  ALU_FSR,
1128:       // Rotate Shift (RV32B)
1129:       ALU_ROL, ALU_ROR,
1130:       // Cyclic Redundancy Checks (RV32B)
1131:       ALU_CRC32_W, ALU_CRC32C_W,
1132:       ALU_CRC32_H, ALU_CRC32C_H,
1133:       ALU_CRC32_B, ALU_CRC32C_B: result_o = multicycle_result;
1134: 
1135:       // Single-Bit Bitmanip Operations (RV32B)
1136:       ALU_SBSET, ALU_SBCLR,
1137:       ALU_SBINV, ALU_SBEXT: result_o = singlebit_result;
1138: 
1139:       // Bit Extract / Deposit (RV32B)
1140:       ALU_BDEP:  result_o = butterfly_result;
1141:       ALU_BEXT:  result_o = invbutterfly_result;
1142: 
1143:       // General Reverse / Or-combine (RV32B)
1144:       ALU_GREV, ALU_GORC: result_o = butterfly_result;
1145: 
1146:       // Bit Field Place (RV32B)
1147:       ALU_BFP: result_o = bfp_result;
1148: 
1149:       // Carry-less Multiply Operations (RV32B)
1150:       ALU_CLMUL, ALU_CLMULR,
1151:       ALU_CLMULH: result_o = clmul_result;
1152: 
1153:       default: ;
1154:     endcase
1155:   end
1156: 
1157: endmodule
1158: