Skip to content

Commit 7f4570b

Browse files
authored
Merge pull request #7984 from tautschnig/features/tree-multipliers
Multiplication encoding: cleanup, Dadda, data
2 parents 25cb64d + 1699176 commit 7f4570b

File tree

2 files changed

+249
-37
lines changed

2 files changed

+249
-37
lines changed

src/solvers/flattening/bv_utils.cpp

+248-37
Original file line numberDiff line numberDiff line change
@@ -664,14 +664,15 @@ bvt bv_utilst::wallace_tree(const std::vector<bvt> &pps)
664664
INVARIANT(a.size() == b.size(), "groups should be of equal size");
665665
INVARIANT(a.size() == c.size(), "groups should be of equal size");
666666

667-
bvt s(a.size()), t(a.size());
667+
bvt s, t(a.size(), const_literal(false));
668+
s.reserve(a.size());
668669

669670
for(std::size_t bit=0; bit<a.size(); bit++)
670671
{
671-
// \todo reformulate using full_adder
672-
s[bit]=prop.lxor(a[bit], prop.lxor(b[bit], c[bit]));
673-
t[bit]=(bit==0)?const_literal(false):
674-
carry(a[bit-1], b[bit-1], c[bit-1]);
672+
literalt carry_out;
673+
s.push_back(full_adder(a[bit], b[bit], c[bit], carry_out));
674+
if(bit + 1 < a.size())
675+
t[bit + 1] = carry_out;
675676
}
676677

677678
new_pps.push_back(std::move(s));
@@ -687,66 +688,276 @@ bvt bv_utilst::wallace_tree(const std::vector<bvt> &pps)
687688
}
688689
}
689690

690-
bvt bv_utilst::unsigned_multiplier(const bvt &_op0, const bvt &_op1)
691+
bvt bv_utilst::dadda_tree(const std::vector<bvt> &pps)
691692
{
692-
#if 1
693-
bvt op0=_op0, op1=_op1;
693+
PRECONDITION(!pps.empty());
694694

695-
if(is_constant(op1))
696-
std::swap(op0, op1);
695+
using columnt = std::list<literalt>;
696+
std::vector<columnt> columns(pps.front().size());
697+
for(const auto &pp : pps)
698+
{
699+
PRECONDITION(pp.size() == pps.front().size());
700+
for(std::size_t i = 0; i < pp.size(); ++i)
701+
{
702+
if(!pp[i].is_false())
703+
columns[i].push_back(pp[i]);
704+
}
705+
}
697706

698-
bvt product;
699-
product.resize(op0.size());
707+
std::list<std::size_t> dadda_sequence;
708+
for(std::size_t d = 2; d < pps.front().size(); d = (d * 3) / 2)
709+
dadda_sequence.push_front(d);
700710

701-
for(std::size_t i=0; i<product.size(); i++)
702-
product[i]=const_literal(false);
703-
704-
for(std::size_t sum=0; sum<op0.size(); sum++)
705-
if(op0[sum]!=const_literal(false))
711+
for(auto d : dadda_sequence)
712+
{
713+
for(auto col_it = columns.begin(); col_it != columns.end();) // no ++col_it
706714
{
707-
bvt tmpop = zeros(sum);
708-
tmpop.reserve(op0.size());
715+
if(col_it->size() <= d)
716+
++col_it;
717+
else if(col_it->size() == d + 1)
718+
{
719+
// Dadda prescribes a half adder here, but OPTIMAL_FULL_ADDER makes
720+
// full_adder actually build a half adder when carry-in is false.
721+
literalt carry_out;
722+
col_it->push_back(full_adder(
723+
col_it->front(),
724+
*std::next(col_it->begin()),
725+
const_literal(false),
726+
carry_out));
727+
col_it->pop_front();
728+
col_it->pop_front();
729+
if(std::next(col_it) != columns.end())
730+
std::next(col_it)->push_back(carry_out);
731+
}
732+
else
733+
{
734+
// We could also experiment with n:2 compressors (for n > 3, n=3 is the
735+
// full adder as used below) that use circuits with lower gate count
736+
// than just combining multiple full adders.
737+
literalt carry_out;
738+
col_it->push_back(full_adder(
739+
col_it->front(),
740+
*std::next(col_it->begin()),
741+
*std::next(std::next(col_it->begin())),
742+
carry_out));
743+
col_it->pop_front();
744+
col_it->pop_front();
745+
col_it->pop_front();
746+
if(std::next(col_it) != columns.end())
747+
std::next(col_it)->push_back(carry_out);
748+
}
749+
}
750+
}
709751

710-
for(std::size_t idx=sum; idx<op0.size(); idx++)
711-
tmpop.push_back(prop.land(op1[idx-sum], op0[sum]));
752+
bvt a, b;
753+
a.reserve(pps.front().size());
754+
b.reserve(pps.front().size());
712755

713-
product=add(product, tmpop);
756+
for(const auto &col : columns)
757+
{
758+
PRECONDITION(col.size() <= 2);
759+
switch(col.size())
760+
{
761+
case 0:
762+
a.push_back(const_literal(false));
763+
b.push_back(const_literal(false));
764+
break;
765+
case 1:
766+
a.push_back(col.front());
767+
b.push_back(const_literal(false));
768+
break;
769+
case 2:
770+
a.push_back(col.front());
771+
b.push_back(col.back());
714772
}
773+
}
715774

716-
return product;
717-
#else
718-
// Wallace tree multiplier. This is disabled, as runtimes have
719-
// been observed to go up by 5%-10%, and on some models even by 20%.
775+
return add(a, b);
776+
}
720777

721-
// build the usual quadratic number of partial products
778+
// Wallace tree multiplier. This is disabled, as runtimes have
779+
// been observed to go up by 5%-10%, and on some models even by 20%.
780+
// #define WALLACE_TREE
781+
// Dadda' reduction scheme. This yields a smaller formula size than Wallace
782+
// trees (and also the default addition scheme), but remains disabled as it
783+
// isn't consistently more performant either.
784+
// #define DADDA_TREE
785+
786+
// The following examples demonstrate the performance differences (with a
787+
// time-out of 7200 seconds):
788+
//
789+
// #ifndef BITWIDTH
790+
// #define BITWIDTH 8
791+
// #endif
792+
//
793+
// int main()
794+
// {
795+
// __CPROVER_bitvector[BITWIDTH] a, b;
796+
// __CPROVER_assert(a * 3 == a + a + a, "multiplication by 3");
797+
// __CPROVER_assert(3 * a == a + a + a, "multiplication by 3");
798+
// __CPROVER_bitvector[BITWIDTH] ab = a * b;
799+
// __CPROVER_bitvector[BITWIDTH * 2] ab_check =
800+
// (__CPROVER_bitvector[BITWIDTH * 2])a *
801+
// (__CPROVER_bitvector[BITWIDTH * 2])b;
802+
// __CPROVER_assert(
803+
// ab == (__CPROVER_bitvector[BITWIDTH])ab_check, "should pass");
804+
// }
805+
//
806+
// |----|-----------------------------|-----------------------------|
807+
// | | CaDiCaL | MiniSat2 |
808+
// |----|-----------------------------|-----------------------------|
809+
// | n | No tree | Wallace | Dadda | No tree | Wallace | Dadda |
810+
// |----|---------|---------|---------|---------|---------|---------|
811+
// | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
812+
// | 2 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
813+
// | 3 | 0.01 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 |
814+
// | 4 | 0.01 | 0.01 | 0.01 | 0.01 | 0.01 | 0.01 |
815+
// | 5 | 0.04 | 0.04 | 0.04 | 0.01 | 0.01 | 0.01 |
816+
// | 6 | 0.11 | 0.13 | 0.12 | 0.04 | 0.05 | 0.06 |
817+
// | 7 | 0.28 | 0.46 | 0.44 | 0.15 | 0.27 | 0.31 |
818+
// | 8 | 0.50 | 1.55 | 1.09 | 0.90 | 1.06 | 1.36 |
819+
// | 9 | 2.22 | 3.63 | 2.67 | 3.40 | 5.85 | 3.44 |
820+
// | 10 | 2.79 | 4.81 | 4.69 | 4.32 | 28.41 | 28.01 |
821+
// | 11 | 8.48 | 4.45 | 11.99 | 38.24 | 98.55 | 86.46 |
822+
// | 12 | 14.52 | 4.86 | 5.80 | 115.00 | 140.11 | 461.70 |
823+
// | 13 | 33.62 | 5.56 | 6.14 | 210.24 | 805.59 | 609.01 |
824+
// | 14 | 37.23 | 6.11 | 8.01 | 776.75 | 689.96 | 6153.82 |
825+
// | 15 | 108.65 | 7.86 | 14.72 | 1048.92 | 1421.74 | 6881.78 |
826+
// | 16 | 102.61 | 14.08 | 18.54 | 1628.01 | timeout | 1943.85 |
827+
// | 17 | 117.89 | 18.53 | 9.19 | 4148.73 | timeout | timeout |
828+
// | 18 | 209.40 | 7.97 | 7.74 | 2760.51 | timeout | timeout |
829+
// | 19 | 168.21 | 18.04 | 15.00 | 2514.21 | timeout | timeout |
830+
// | 20 | 566.76 | 12.68 | 22.47 | 2609.09 | timeout | timeout |
831+
// | 21 | 786.31 | 23.80 | 23.80 | 2232.77 | timeout | timeout |
832+
// | 22 | 817.74 | 17.64 | 22.53 | 3866.70 | timeout | timeout |
833+
// | 23 | 1102.76 | 24.19 | 26.37 | timeout | timeout | timeout |
834+
// | 24 | 1319.59 | 27.37 | 29.95 | timeout | timeout | timeout |
835+
// | 25 | 1786.11 | 27.10 | 29.94 | timeout | timeout | timeout |
836+
// | 26 | 1952.18 | 31.08 | 33.95 | timeout | timeout | timeout |
837+
// | 27 | 6908.48 | 27.92 | 34.94 | timeout | timeout | timeout |
838+
// | 28 | 6919.34 | 36.63 | 33.78 | timeout | timeout | timeout |
839+
// | 29 | timeout | 27.95 | 40.69 | timeout | timeout | timeout |
840+
// | 30 | timeout | 36.94 | 31.59 | timeout | timeout | timeout |
841+
// | 31 | timeout | 38.41 | 40.04 | timeout | timeout | timeout |
842+
// | 32 | timeout | 33.06 | 91.38 | timeout | timeout | timeout |
843+
// |----|---------|---------|---------|---------|---------|---------|
844+
//
845+
// In summary, both Wallace tree and Dadda reduction are substantially more
846+
// efficient with CaDiCaL on the above code for all bit width > 11, but somewhat
847+
// slower with MiniSat.
848+
//
849+
// #ifndef BITWIDTH
850+
// #define BITWIDTH 8
851+
// #endif
852+
//
853+
// int main()
854+
// {
855+
// __CPROVER_bitvector[BITWIDTH] a, b, c, ab, bc, abc;
856+
// ab = a * b;
857+
// __CPROVER_bitvector[BITWIDTH * 2] ab_check =
858+
// (__CPROVER_bitvector[BITWIDTH * 2])a *
859+
// (__CPROVER_bitvector[BITWIDTH * 2])b;
860+
// __CPROVER_assume(ab == ab_check);
861+
// bc = b * c;
862+
// __CPROVER_bitvector[BITWIDTH * 2] bc_check =
863+
// (__CPROVER_bitvector[BITWIDTH * 2])b *
864+
// (__CPROVER_bitvector[BITWIDTH * 2])c;
865+
// __CPROVER_assume(bc == bc_check);
866+
// abc = ab * c;
867+
// __CPROVER_bitvector[BITWIDTH * 2] abc_check =
868+
// (__CPROVER_bitvector[BITWIDTH * 2])ab *
869+
// (__CPROVER_bitvector[BITWIDTH * 2])c;
870+
// __CPROVER_assume(abc == abc_check);
871+
// __CPROVER_assert(abc == a * bc, "associativity");
872+
// }
873+
//
874+
// |----|-----------------------------|-----------------------------|
875+
// | | CaDiCaL | MiniSat2 |
876+
// |----|-----------------------------|-----------------------------|
877+
// | n | No tree | Wallace | Dadda | No tree | Wallace | Dadda |
878+
// |----|---------|---------|---------|---------|---------|---------|
879+
// | 1 | 0.00 | 0.00 | 0.00 | 0.01 | 0.01 | 0.01 |
880+
// | 2 | 0.01 | 0.01 | 0.01 | 0.01 | 0.01 | 0.01 |
881+
// | 3 | 0.02 | 0.03 | 0.03 | 0.01 | 0.01 | 0.01 |
882+
// | 4 | 0.05 | 0.07 | 0.06 | 0.02 | 0.02 | 0.02 |
883+
// | 5 | 0.17 | 0.18 | 0.14 | 0.04 | 0.04 | 0.04 |
884+
// | 6 | 0.50 | 0.63 | 0.63 | 0.13 | 0.14 | 0.13 |
885+
// | 7 | 1.01 | 1.15 | 0.90 | 0.43 | 0.47 | 0.47 |
886+
// | 8 | 1.56 | 1.76 | 1.75 | 3.35 | 2.39 | 1.75 |
887+
// | 9 | 2.07 | 2.48 | 1.75 | 11.20 | 12.64 | 7.94 |
888+
// | 10 | 3.58 | 3.88 | 3.54 | 20.23 | 23.49 | 15.66 |
889+
// | 11 | 5.84 | 7.46 | 5.31 | 49.32 | 39.86 | 44.15 |
890+
// | 12 | 11.71 | 16.85 | 13.40 | 69.32 | 156.57 | 46.50 |
891+
// | 13 | 33.22 | 41.95 | 34.43 | 250.91 | 294.73 | 79.47 |
892+
// | 14 | 76.27 | 109.59 | 84.49 | 226.98 | 259.84 | 258.08 |
893+
// | 15 | 220.01 | 330.10 | 267.11 | 783.73 | 1160.47 | 1262.91 |
894+
// | 16 | 591.91 | 981.16 | 808.33 | 2712.20 | 4286.60 | timeout |
895+
// | 17 | 1634.97 | 2574.81 | 2006.27 | timeout | timeout | timeout |
896+
// | 18 | 4680.16 | timeout | 6704.35 | timeout | timeout | timeout |
897+
// | 19 | timeout | timeout | timeout | timeout | timeout | timeout |
898+
// | 20 | timeout | timeout | timeout | timeout | timeout | timeout |
899+
// | 21 | timeout | timeout | timeout | timeout | timeout | timeout |
900+
// | 22 | timeout | timeout | timeout | timeout | timeout | timeout |
901+
// | 23 | timeout | timeout | timeout | timeout | timeout | timeout |
902+
// | 24 | timeout | timeout | timeout | timeout | timeout | timeout |
903+
// | 25 | timeout | timeout | timeout | timeout | timeout | timeout |
904+
// | 26 | timeout | timeout | timeout | timeout | timeout | timeout |
905+
// | 27 | timeout | timeout | timeout | timeout | timeout | timeout |
906+
// | 28 | timeout | timeout | timeout | timeout | timeout | timeout |
907+
// | 29 | timeout | timeout | timeout | timeout | timeout | timeout |
908+
// | 30 | timeout | timeout | timeout | timeout | timeout | timeout |
909+
// | 31 | timeout | timeout | timeout | timeout | timeout | timeout |
910+
// | 32 | timeout | timeout | timeout | timeout | timeout | timeout |
911+
// |----|---------|---------|---------|---------|---------|---------|
912+
//
913+
// In summary, Wallace tree reduction is slower for both solvers and all bit
914+
// widths (except BITWIDTH==8 with MiniSat2). Dadda multipliers get closer to
915+
// our multiplier that's not using a tree reduction scheme, but aren't uniformly
916+
// better either.
722917

918+
bvt bv_utilst::unsigned_multiplier(const bvt &_op0, const bvt &_op1)
919+
{
723920
bvt op0=_op0, op1=_op1;
724921

725922
if(is_constant(op1))
726923
std::swap(op0, op1);
727924

925+
// build the usual quadratic number of partial products
728926
std::vector<bvt> pps;
729927
pps.reserve(op0.size());
730928

731929
for(std::size_t bit=0; bit<op0.size(); bit++)
732-
if(op0[bit]!=const_literal(false))
733-
{
734-
// zeros according to weight
735-
bvt pp = zeros(bit);
736-
pp.reserve(op0.size());
930+
{
931+
if(op0[bit] == const_literal(false))
932+
continue;
737933

738-
for(std::size_t idx=bit; idx<op0.size(); idx++)
739-
pp.push_back(prop.land(op1[idx-bit], op0[bit]));
934+
// zeros according to weight
935+
bvt pp = zeros(bit);
936+
pp.reserve(op0.size());
740937

741-
pps.push_back(pp);
742-
}
938+
for(std::size_t idx = bit; idx < op0.size(); idx++)
939+
pp.push_back(prop.land(op1[idx - bit], op0[bit]));
940+
941+
pps.push_back(pp);
942+
}
743943

744944
if(pps.empty())
745945
return zeros(op0.size());
746946
else
947+
{
948+
#ifdef WALLACE_TREE
747949
return wallace_tree(pps);
950+
#elif defined(DADDA_TREE)
951+
return dadda_tree(pps);
952+
#else
953+
bvt product = pps.front();
748954

749-
#endif
955+
for(auto it = std::next(pps.begin()); it != pps.end(); ++it)
956+
product = add(product, *it);
957+
958+
return product;
959+
#endif
960+
}
750961
}
751962

752963
bvt bv_utilst::unsigned_multiplier_no_overflow(

src/solvers/flattening/bv_utils.h

+1
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,7 @@ class bv_utilst
244244
bvt cond_negate_no_overflow(const bvt &bv, const literalt cond);
245245

246246
bvt wallace_tree(const std::vector<bvt> &pps);
247+
bvt dadda_tree(const std::vector<bvt> &pps);
247248
};
248249

249250
#endif // CPROVER_SOLVERS_FLATTENING_BV_UTILS_H

0 commit comments

Comments
 (0)