Skip to content

Commit 1699176

Browse files
committed
Dadda multiplier
Implement Luigi Dadda (1965)'s reduction scheme, which uses fewer full adders than Wallace Trees. Still, performance is not strictly better than without a tree reduction scheme.
1 parent f46a8f3 commit 1699176

File tree

2 files changed

+177
-81
lines changed

2 files changed

+177
-81
lines changed

src/solvers/flattening/bv_utils.cpp

+176-81
Original file line numberDiff line numberDiff line change
@@ -688,9 +688,100 @@ bvt bv_utilst::wallace_tree(const std::vector<bvt> &pps)
688688
}
689689
}
690690

691+
bvt bv_utilst::dadda_tree(const std::vector<bvt> &pps)
692+
{
693+
PRECONDITION(!pps.empty());
694+
695+
using columnt = std::list<literalt>;
696+
std::vector<columnt> columns(pps.front().size());
697+
for(const auto &pp : pps)
698+
{
699+
PRECONDITION(pp.size() == pps.front().size());
700+
for(std::size_t i = 0; i < pp.size(); ++i)
701+
{
702+
if(!pp[i].is_false())
703+
columns[i].push_back(pp[i]);
704+
}
705+
}
706+
707+
std::list<std::size_t> dadda_sequence;
708+
for(std::size_t d = 2; d < pps.front().size(); d = (d * 3) / 2)
709+
dadda_sequence.push_front(d);
710+
711+
for(auto d : dadda_sequence)
712+
{
713+
for(auto col_it = columns.begin(); col_it != columns.end();) // no ++col_it
714+
{
715+
if(col_it->size() <= d)
716+
++col_it;
717+
else if(col_it->size() == d + 1)
718+
{
719+
// Dadda prescribes a half adder here, but OPTIMAL_FULL_ADDER makes
720+
// full_adder actually build a half adder when carry-in is false.
721+
literalt carry_out;
722+
col_it->push_back(full_adder(
723+
col_it->front(),
724+
*std::next(col_it->begin()),
725+
const_literal(false),
726+
carry_out));
727+
col_it->pop_front();
728+
col_it->pop_front();
729+
if(std::next(col_it) != columns.end())
730+
std::next(col_it)->push_back(carry_out);
731+
}
732+
else
733+
{
734+
// We could also experiment with n:2 compressors (for n > 3, n=3 is the
735+
// full adder as used below) that use circuits with lower gate count
736+
// than just combining multiple full adders.
737+
literalt carry_out;
738+
col_it->push_back(full_adder(
739+
col_it->front(),
740+
*std::next(col_it->begin()),
741+
*std::next(std::next(col_it->begin())),
742+
carry_out));
743+
col_it->pop_front();
744+
col_it->pop_front();
745+
col_it->pop_front();
746+
if(std::next(col_it) != columns.end())
747+
std::next(col_it)->push_back(carry_out);
748+
}
749+
}
750+
}
751+
752+
bvt a, b;
753+
a.reserve(pps.front().size());
754+
b.reserve(pps.front().size());
755+
756+
for(const auto &col : columns)
757+
{
758+
PRECONDITION(col.size() <= 2);
759+
switch(col.size())
760+
{
761+
case 0:
762+
a.push_back(const_literal(false));
763+
b.push_back(const_literal(false));
764+
break;
765+
case 1:
766+
a.push_back(col.front());
767+
b.push_back(const_literal(false));
768+
break;
769+
case 2:
770+
a.push_back(col.front());
771+
b.push_back(col.back());
772+
}
773+
}
774+
775+
return add(a, b);
776+
}
777+
691778
// Wallace tree multiplier. This is disabled, as runtimes have
692779
// been observed to go up by 5%-10%, and on some models even by 20%.
693780
// #define WALLACE_TREE
781+
// Dadda' reduction scheme. This yields a smaller formula size than Wallace
782+
// trees (and also the default addition scheme), but remains disabled as it
783+
// isn't consistently more performant either.
784+
// #define DADDA_TREE
694785

695786
// The following examples demonstrate the performance differences (with a
696787
// time-out of 7200 seconds):
@@ -712,48 +803,48 @@ bvt bv_utilst::wallace_tree(const std::vector<bvt> &pps)
712803
// ab == (__CPROVER_bitvector[BITWIDTH])ab_check, "should pass");
713804
// }
714805
//
715-
// |----|-------------------|-------------------|
716-
// | | CaDiCaL | MiniSat2 |
717-
// |----|-------------------|-------------------|
718-
// | n | No tree | Wallace | No tree | Wallace |
719-
// |----|---------|---------|---------|---------|
720-
// | 1 | 0.00 | 0.00 | 0.00 | 0.00 |
721-
// | 2 | 0.00 | 0.00 | 0.00 | 0.00 |
722-
// | 3 | 0.01 | 0.01 | 0.00 | 0.00 |
723-
// | 4 | 0.01 | 0.01 | 0.01 | 0.01 |
724-
// | 5 | 0.04 | 0.04 | 0.01 | 0.01 |
725-
// | 6 | 0.11 | 0.13 | 0.04 | 0.05 |
726-
// | 7 | 0.28 | 0.46 | 0.15 | 0.27 |
727-
// | 8 | 0.50 | 1.55 | 0.90 | 1.06 |
728-
// | 9 | 2.22 | 3.63 | 3.40 | 5.85 |
729-
// | 10 | 2.79 | 4.81 | 4.32 | 28.41 |
730-
// | 11 | 8.48 | 4.45 | 38.24 | 98.55 |
731-
// | 12 | 14.52 | 4.86 | 115.00 | 140.11 |
732-
// | 13 | 33.62 | 5.56 | 210.24 | 805.59 |
733-
// | 14 | 37.23 | 6.11 | 776.75 | 689.96 |
734-
// | 15 | 108.65 | 7.86 | 1048.92 | 1421.74 |
735-
// | 16 | 102.61 | 14.08 | 1628.01 | timeout |
736-
// | 17 | 117.89 | 18.53 | 4148.73 | timeout |
737-
// | 18 | 209.40 | 7.97 | 2760.51 | timeout |
738-
// | 19 | 168.21 | 18.04 | 2514.21 | timeout |
739-
// | 20 | 566.76 | 12.68 | 2609.09 | timeout |
740-
// | 21 | 786.31 | 23.80 | 2232.77 | timeout |
741-
// | 22 | 817.74 | 17.64 | 3866.70 | timeout |
742-
// | 23 | 1102.76 | 24.19 | timeout | timeout |
743-
// | 24 | 1319.59 | 27.37 | timeout | timeout |
744-
// | 25 | 1786.11 | 27.10 | timeout | timeout |
745-
// | 26 | 1952.18 | 31.08 | timeout | timeout |
746-
// | 27 | 6908.48 | 27.92 | timeout | timeout |
747-
// | 28 | 6919.34 | 36.63 | timeout | timeout |
748-
// | 29 | timeout | 27.95 | timeout | timeout |
749-
// | 30 | timeout | 36.94 | timeout | timeout |
750-
// | 31 | timeout | 38.41 | timeout | timeout |
751-
// | 32 | timeout | 33.06 | timeout | timeout |
752-
// |----|---------|---------|---------|---------|
806+
// |----|-----------------------------|-----------------------------|
807+
// | | CaDiCaL | MiniSat2 |
808+
// |----|-----------------------------|-----------------------------|
809+
// | n | No tree | Wallace | Dadda | No tree | Wallace | Dadda |
810+
// |----|---------|---------|---------|---------|---------|---------|
811+
// | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
812+
// | 2 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
813+
// | 3 | 0.01 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 |
814+
// | 4 | 0.01 | 0.01 | 0.01 | 0.01 | 0.01 | 0.01 |
815+
// | 5 | 0.04 | 0.04 | 0.04 | 0.01 | 0.01 | 0.01 |
816+
// | 6 | 0.11 | 0.13 | 0.12 | 0.04 | 0.05 | 0.06 |
817+
// | 7 | 0.28 | 0.46 | 0.44 | 0.15 | 0.27 | 0.31 |
818+
// | 8 | 0.50 | 1.55 | 1.09 | 0.90 | 1.06 | 1.36 |
819+
// | 9 | 2.22 | 3.63 | 2.67 | 3.40 | 5.85 | 3.44 |
820+
// | 10 | 2.79 | 4.81 | 4.69 | 4.32 | 28.41 | 28.01 |
821+
// | 11 | 8.48 | 4.45 | 11.99 | 38.24 | 98.55 | 86.46 |
822+
// | 12 | 14.52 | 4.86 | 5.80 | 115.00 | 140.11 | 461.70 |
823+
// | 13 | 33.62 | 5.56 | 6.14 | 210.24 | 805.59 | 609.01 |
824+
// | 14 | 37.23 | 6.11 | 8.01 | 776.75 | 689.96 | 6153.82 |
825+
// | 15 | 108.65 | 7.86 | 14.72 | 1048.92 | 1421.74 | 6881.78 |
826+
// | 16 | 102.61 | 14.08 | 18.54 | 1628.01 | timeout | 1943.85 |
827+
// | 17 | 117.89 | 18.53 | 9.19 | 4148.73 | timeout | timeout |
828+
// | 18 | 209.40 | 7.97 | 7.74 | 2760.51 | timeout | timeout |
829+
// | 19 | 168.21 | 18.04 | 15.00 | 2514.21 | timeout | timeout |
830+
// | 20 | 566.76 | 12.68 | 22.47 | 2609.09 | timeout | timeout |
831+
// | 21 | 786.31 | 23.80 | 23.80 | 2232.77 | timeout | timeout |
832+
// | 22 | 817.74 | 17.64 | 22.53 | 3866.70 | timeout | timeout |
833+
// | 23 | 1102.76 | 24.19 | 26.37 | timeout | timeout | timeout |
834+
// | 24 | 1319.59 | 27.37 | 29.95 | timeout | timeout | timeout |
835+
// | 25 | 1786.11 | 27.10 | 29.94 | timeout | timeout | timeout |
836+
// | 26 | 1952.18 | 31.08 | 33.95 | timeout | timeout | timeout |
837+
// | 27 | 6908.48 | 27.92 | 34.94 | timeout | timeout | timeout |
838+
// | 28 | 6919.34 | 36.63 | 33.78 | timeout | timeout | timeout |
839+
// | 29 | timeout | 27.95 | 40.69 | timeout | timeout | timeout |
840+
// | 30 | timeout | 36.94 | 31.59 | timeout | timeout | timeout |
841+
// | 31 | timeout | 38.41 | 40.04 | timeout | timeout | timeout |
842+
// | 32 | timeout | 33.06 | 91.38 | timeout | timeout | timeout |
843+
// |----|---------|---------|---------|---------|---------|---------|
753844
//
754-
// In summary, Wallace tree reduction is substantially more efficient with
755-
// CaDiCaL on the above code for all bit width >= 11, but somewhat slower with
756-
// MiniSat.
845+
// In summary, both Wallace tree and Dadda reduction are substantially more
846+
// efficient with CaDiCaL on the above code for all bit width > 11, but somewhat
847+
// slower with MiniSat.
757848
//
758849
// #ifndef BITWIDTH
759850
// #define BITWIDTH 8
@@ -780,47 +871,49 @@ bvt bv_utilst::wallace_tree(const std::vector<bvt> &pps)
780871
// __CPROVER_assert(abc == a * bc, "associativity");
781872
// }
782873
//
783-
// |----|-------------------|-------------------|
784-
// | | CaDiCaL | MiniSat2 |
785-
// |----|-------------------|-------------------|
786-
// | n | No tree | Wallace | No tree | Wallace |
787-
// |----|---------|---------|---------|---------|
788-
// | 1 | 0.00 | 0.00 | 0.01 | 0.01 |
789-
// | 2 | 0.01 | 0.01 | 0.01 | 0.01 |
790-
// | 3 | 0.02 | 0.03 | 0.01 | 0.01 |
791-
// | 4 | 0.05 | 0.07 | 0.02 | 0.02 |
792-
// | 5 | 0.17 | 0.18 | 0.04 | 0.04 |
793-
// | 6 | 0.50 | 0.63 | 0.13 | 0.14 |
794-
// | 7 | 1.01 | 1.15 | 0.43 | 0.47 |
795-
// | 8 | 1.56 | 1.76 | 3.35 | 2.39 |
796-
// | 9 | 2.07 | 2.48 | 11.20 | 12.64 |
797-
// | 10 | 3.58 | 3.88 | 20.23 | 23.49 |
798-
// | 11 | 5.84 | 7.46 | 49.32 | 39.86 |
799-
// | 12 | 11.71 | 16.85 | 69.32 | 156.57 |
800-
// | 13 | 33.22 | 41.95 | 250.91 | 294.73 |
801-
// | 14 | 76.27 | 109.59 | 226.98 | 259.84 |
802-
// | 15 | 220.01 | 330.10 | 783.73 | 1160.47 |
803-
// | 16 | 591.91 | 981.16 | 2712.20 | 4286.60 |
804-
// | 17 | 1634.97 | 2574.81 | timeout | timeout |
805-
// | 18 | 4680.16 | timeout | timeout | timeout |
806-
// | 19 | timeout | timeout | timeout | timeout |
807-
// | 20 | timeout | timeout | timeout | timeout |
808-
// | 21 | timeout | timeout | timeout | timeout |
809-
// | 22 | timeout | timeout | timeout | timeout |
810-
// | 23 | timeout | timeout | timeout | timeout |
811-
// | 24 | timeout | timeout | timeout | timeout |
812-
// | 25 | timeout | timeout | timeout | timeout |
813-
// | 26 | timeout | timeout | timeout | timeout |
814-
// | 27 | timeout | timeout | timeout | timeout |
815-
// | 28 | timeout | timeout | timeout | timeout |
816-
// | 29 | timeout | timeout | timeout | timeout |
817-
// | 30 | timeout | timeout | timeout | timeout |
818-
// | 31 | timeout | timeout | timeout | timeout |
819-
// | 32 | timeout | timeout | timeout | timeout |
820-
// |----|---------|---------|---------|---------|
874+
// |----|-----------------------------|-----------------------------|
875+
// | | CaDiCaL | MiniSat2 |
876+
// |----|-----------------------------|-----------------------------|
877+
// | n | No tree | Wallace | Dadda | No tree | Wallace | Dadda |
878+
// |----|---------|---------|---------|---------|---------|---------|
879+
// | 1 | 0.00 | 0.00 | 0.00 | 0.01 | 0.01 | 0.01 |
880+
// | 2 | 0.01 | 0.01 | 0.01 | 0.01 | 0.01 | 0.01 |
881+
// | 3 | 0.02 | 0.03 | 0.03 | 0.01 | 0.01 | 0.01 |
882+
// | 4 | 0.05 | 0.07 | 0.06 | 0.02 | 0.02 | 0.02 |
883+
// | 5 | 0.17 | 0.18 | 0.14 | 0.04 | 0.04 | 0.04 |
884+
// | 6 | 0.50 | 0.63 | 0.63 | 0.13 | 0.14 | 0.13 |
885+
// | 7 | 1.01 | 1.15 | 0.90 | 0.43 | 0.47 | 0.47 |
886+
// | 8 | 1.56 | 1.76 | 1.75 | 3.35 | 2.39 | 1.75 |
887+
// | 9 | 2.07 | 2.48 | 1.75 | 11.20 | 12.64 | 7.94 |
888+
// | 10 | 3.58 | 3.88 | 3.54 | 20.23 | 23.49 | 15.66 |
889+
// | 11 | 5.84 | 7.46 | 5.31 | 49.32 | 39.86 | 44.15 |
890+
// | 12 | 11.71 | 16.85 | 13.40 | 69.32 | 156.57 | 46.50 |
891+
// | 13 | 33.22 | 41.95 | 34.43 | 250.91 | 294.73 | 79.47 |
892+
// | 14 | 76.27 | 109.59 | 84.49 | 226.98 | 259.84 | 258.08 |
893+
// | 15 | 220.01 | 330.10 | 267.11 | 783.73 | 1160.47 | 1262.91 |
894+
// | 16 | 591.91 | 981.16 | 808.33 | 2712.20 | 4286.60 | timeout |
895+
// | 17 | 1634.97 | 2574.81 | 2006.27 | timeout | timeout | timeout |
896+
// | 18 | 4680.16 | timeout | 6704.35 | timeout | timeout | timeout |
897+
// | 19 | timeout | timeout | timeout | timeout | timeout | timeout |
898+
// | 20 | timeout | timeout | timeout | timeout | timeout | timeout |
899+
// | 21 | timeout | timeout | timeout | timeout | timeout | timeout |
900+
// | 22 | timeout | timeout | timeout | timeout | timeout | timeout |
901+
// | 23 | timeout | timeout | timeout | timeout | timeout | timeout |
902+
// | 24 | timeout | timeout | timeout | timeout | timeout | timeout |
903+
// | 25 | timeout | timeout | timeout | timeout | timeout | timeout |
904+
// | 26 | timeout | timeout | timeout | timeout | timeout | timeout |
905+
// | 27 | timeout | timeout | timeout | timeout | timeout | timeout |
906+
// | 28 | timeout | timeout | timeout | timeout | timeout | timeout |
907+
// | 29 | timeout | timeout | timeout | timeout | timeout | timeout |
908+
// | 30 | timeout | timeout | timeout | timeout | timeout | timeout |
909+
// | 31 | timeout | timeout | timeout | timeout | timeout | timeout |
910+
// | 32 | timeout | timeout | timeout | timeout | timeout | timeout |
911+
// |----|---------|---------|---------|---------|---------|---------|
821912
//
822-
// In summary, Wallace tree reduction is slower for both solvers and all
823-
// bit widths (except BITWIDTH==8 with MiniSat2).
913+
// In summary, Wallace tree reduction is slower for both solvers and all bit
914+
// widths (except BITWIDTH==8 with MiniSat2). Dadda multipliers get closer to
915+
// our multiplier that's not using a tree reduction scheme, but aren't uniformly
916+
// better either.
824917

825918
bvt bv_utilst::unsigned_multiplier(const bvt &_op0, const bvt &_op1)
826919
{
@@ -854,6 +947,8 @@ bvt bv_utilst::unsigned_multiplier(const bvt &_op0, const bvt &_op1)
854947
{
855948
#ifdef WALLACE_TREE
856949
return wallace_tree(pps);
950+
#elif defined(DADDA_TREE)
951+
return dadda_tree(pps);
857952
#else
858953
bvt product = pps.front();
859954

src/solvers/flattening/bv_utils.h

+1
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,7 @@ class bv_utilst
244244
bvt cond_negate_no_overflow(const bvt &bv, const literalt cond);
245245

246246
bvt wallace_tree(const std::vector<bvt> &pps);
247+
bvt dadda_tree(const std::vector<bvt> &pps);
247248
};
248249

249250
#endif // CPROVER_SOLVERS_FLATTENING_BV_UTILS_H

0 commit comments

Comments
 (0)