diff --git a/main.cpp b/main.cpp index a5505a6..bc6715e 100644 --- a/main.cpp +++ b/main.cpp @@ -1,802 +1,802 @@ #include #include #include #include #include #include #include #include #include //#include //#define DEBUG_TRACE #ifndef DEBUG_TRACE // This optimization allows to compute only the // 2n+1 independent components but it biases the // trace check since trace will trivially be forced // to 0. #define EP0_COMPUTE_OPTIM #endif constexpr mp_bitcnt_t default_prec = 512; mpf_class* all_ep0_p; //unsigned int* multiset_buffer; //unsigned int* combination_buffer; inline size_t get_ep0_size(size_t p) { size_t size = (p+1)*(p+2)/2; return size; } inline size_t get_ep0_idx(size_t p1, size_t p2, size_t p3) { size_t p = p1+p2+p3; size_t idx = (p3*(2*p+3-p3))/2+p2; return idx; } inline void get_ep0_coords(size_t p, size_t idx, size_t* p1, size_t* p2, size_t* p3) { size_t ep0_size = get_ep0_size(p); size_t reverse_idx = ep0_size - idx - 1; size_t A = (size_t)std::ceil( (-1.0 + std::sqrt(1 + 8*(reverse_idx+1)))/2.0 ); *p3 = p + 1 - A; *p2 = idx - (ep0_size - A*(A+1)/2 ); *p1 = p - *p3 - *p2; } inline mpf_class coeff_Np0(size_t p) { // (2p)!/(p!) mpf_class p2fac_over_pfac = 1.0; for(size_t fac = 2*p; fac > p; --fac) p2fac_over_pfac *= fac; // p! mpf_class pfac = 1.0; for(size_t fac = 2; fac <= p; ++fac) pfac *= fac; // 1/2^p mpf_class one_over_2p = std::pow(2.0, -(int)p); // sqrt( (2p)! / 2^p / (p!)^2 ) mpf_class result = sqrt(one_over_2p * p2fac_over_pfac / pfac); return result; } inline mpf_class arrangement_factor(size_t n, size_t m) { assert(m <= n/2); size_t m2 = n - 2*m; // Note: m <= n/2, thus m <= m2 mpf_class nfac_over_m2fac = 1.0; for(size_t fac = n; fac > m2; --fac) nfac_over_m2fac *= fac; mpf_class mfac = 1.0; for(size_t fac = 2; fac <= m; ++fac) mfac *= fac; mpf_class one_over_2m = std::pow(2.0, -(int)m); mpf_class result = one_over_2m * nfac_over_m2fac / mfac; return result; } mpf_class compute_ep0(size_t p1, size_t p2, size_t p3) { // Trivial case if((p1%2) + (p2%2) > 0) return 0.0; size_t p = p1+p2+p3; size_t k1 = p1/2; size_t k2 = p2/2; size_t k3 = p3/2; mpf_class result = 0.0; mpf_class sign = (k1+k2) % 2 == 0 ? -1.0 : 1.0; for(size_t m3 = 0; m3 <= k3; ++m3) { sign *= -1.0; mpf_class p1fac_over_k1fac = 1.0; for(size_t fac = p1; fac > k1; --fac) p1fac_over_k1fac *= fac; mpf_class p2fac_over_k2fac = 1.0; for(size_t fac = p2; fac > k2; --fac) p2fac_over_k2fac *= fac; assert( (p + p3 -2*m3 -1) % 2 == (2*p-1) % 2 ); // NOTE: Here the denominator is greater than numerator mpf_class one_over_double_fac_ratio = 1.0; for(size_t fac = 2*p-1; fac > p + p3 -2*m3 -1; fac -= 2 ) one_over_double_fac_ratio *= fac; mpf_class one_over_2k1k2 = std::pow(2.0, -(int)(k1+k2)); result += sign * one_over_2k1k2 / one_over_double_fac_ratio * p1fac_over_k1fac * p2fac_over_k2fac * arrangement_factor(p3, m3); } result *= coeff_Np0(p); return result; } #define max(a,b) (((a)<(b))?(b):(a)) void check_trace_of_ep0(size_t p, mpf_class* ep0_p) { std::cout << "=== Begin Check Trace (p: " << p << ") ===\n"; if(p < 2) { std::cout << " - Trace is only valide for tansor of rank p >= 2.\n"; return; } mpf_class max_error = 0.0; size_t n = p-2; size_t n1 = n; size_t n2 = 0; size_t n3 = 0; do { size_t idx1 = get_ep0_idx(n1+2,n2,n3); size_t idx2 = get_ep0_idx(n1,n2+2,n3); size_t idx3 = get_ep0_idx(n1,n2,n3+2); mpf_class trace = ep0_p[idx1]+ep0_p[idx2]+ep0_p[idx3]; //std::printf("Trace: %g (%g, %g, %g)\n", trace, ep0_p[idx1],ep0_p[idx2],ep0_p[idx3]); max_error = max(abs(trace), max_error); - - n2 = (++n2)%(n+1-n3); + + n2 = (n2+1)%(n+1-n3); if(n2 == 0) ++n3; n1 = n - n2 - n3; } while(n3 <= n); std::cout << " - Max Error: " << max_error << "\n"; } inline size_t get_ep0_offset_from_p(size_t p) { size_t offset = p*(p+1)*(p+2)/6; return offset; } inline size_t get_ep0_p_from_offseted_idx(size_t idx) { // NOTE: Exact solution is not precise enough due to // fp errors. This is why I choosed to go with Newton's root finding mpf_class x = 0.0; mpf_class offset_f = -1.0; while(abs(x-offset_f)>1e-4) // magic number, have been tested to be enough for p -> 1'000'000 { offset_f = x; x = (2*x*x*x + 3*x*x + 6*idx) / (3*x*x + 6*x + 2); //std::cout << "x = " << x << "\n"; } size_t offset = (size_t)(offset_f.get_d()); return offset; } inline size_t get_ep0_offset_from_offseted_idx(size_t idx) { return get_ep0_offset_from_p(get_ep0_p_from_offseted_idx(idx)); } void fill_ep0(size_t p, mpf_class* ep0_p) { size_t p1 = p; size_t p2 = 0; size_t p3 = 0; do { size_t idx = get_ep0_idx(p1,p2,p3); #ifdef EP0_COMPUTE_OPTIM if(idx <= 2*p) // <=> idx+1 < 2p+1 #endif { ep0_p[idx] = compute_ep0(p1, p2, p3); } #ifdef EP0_COMPUTE_OPTIM else #else if(false) #endif { assert(p3 >= 2); size_t idx1 = get_ep0_idx(p1+2,p2,p3-2); size_t idx2 = get_ep0_idx(p1,p2+2,p3-2); ep0_p[idx] = - ep0_p[idx1] - ep0_p[idx2]; } //if(ep0_p[idx] != 0.0) { //std::printf("- (%zu, %zu, %zu) is located at pos %zu", p1, p2, p3, idx); //std::cout << " -> value is " << ep0_p[idx] << "\n"; } - p2 = (++p2)%(p+1-p3); + p2 = (p2+1)%(p+1-p3); if(p2 == 0) ++p3; p1 = p - p2 - p3; } while(p3 <= p); } void DEBUG_clear_all_ep0(size_t p_max, mpf_class* all_ep0_p) { size_t upper_bound = get_ep0_offset_from_p(p_max+1); for(size_t i = 0; i < upper_bound; ++i) all_ep0_p[i] = -100.0; } void DEBUG_dump_all_ep0(size_t p_max, mpf_class* all_ep0_p) { std::cout << " === DUMP ALL ep0 === \n"; for(size_t p = 0; p <= p_max; ++p) { size_t line_counter = 0; for(size_t i = 0; i < get_ep0_size(p); ++i) { std::cout << all_ep0_p[get_ep0_offset_from_p(p)+i] << " "; ++line_counter; if(line_counter % 12 == 0) { std::cout << "\n"; } } if(p < p_max) std::cout << "\n -------- "; std::cout << "\n"; } } void fill_all_ep0(size_t p_max, mpf_class* all_ep0_p) { for(size_t p = 0; p <= p_max; ++p) { mpf_class* ep0_p = all_ep0_p + get_ep0_offset_from_p(p); fill_ep0(p, ep0_p); } } void check_trace_of_all_ep0(size_t p_max, mpf_class* all_ep0_p) { for(size_t p = 0; p <= p_max; ++p) check_trace_of_ep0(p, all_ep0_p+get_ep0_offset_from_p(p)); } void dump_mpf_to_buffer(mpf_class* value_p, int* prec_p, int* size_p, mp_exp_t* exp_p, mp_limb_t* limbs_p) { // NOTE: This procedure is only valid beacause mpf is guarantied to be of fixed precision *prec_p = value_p->get_mpf_t()->_mp_prec; assert(value_p->get_prec() == default_prec); *size_p = value_p->get_mpf_t()->_mp_size; *exp_p = value_p->get_mpf_t()->_mp_exp; - for(size_t i = 0; i < abs(*size_p); ++i) + for(size_t i = 0; i < (size_t)abs(*size_p); ++i) limbs_p[i] = *(value_p->get_mpf_t()->_mp_d+i); } void undump_mpf_from_buffer(mpf_class* value_p, int* prec_p, int* size_p, mp_exp_t* exp_p, mp_limb_t* limbs_p) { // NOTE: This procedure is only valid beacause mpf is guarantied to be of fixed precision value_p->get_mpf_t()->_mp_prec = *prec_p; value_p->get_mpf_t()->_mp_size = *size_p; value_p->get_mpf_t()->_mp_exp = *exp_p; - for(size_t i = 0; i < abs(*size_p); ++i) + for(size_t i = 0; i < (size_t)abs(*size_p); ++i) *(value_p->get_mpf_t()->_mp_d+i) = limbs_p[i]; } struct mpf_content_buffer { size_t size; // --- int* size_p; int* prec_p; mp_exp_t* exp_p; mp_limb_t* limbs_p; }; void init_mpf_content_buffer(mpf_content_buffer* buffer, size_t size) { buffer->size = size; buffer->size_p = new int[size]; buffer->prec_p = new int[size]; buffer->exp_p = new mp_exp_t[size]; buffer->limbs_p = new mp_limb_t[size*(default_prec + 1)]; } void clear_mpf_content_buffer(mpf_content_buffer* buffer) { delete[] buffer->limbs_p; delete[] buffer->exp_p; delete[] buffer->prec_p; delete[] buffer->size_p; buffer->size = 0; } void dump_array_of_mpf_to_buffer(mpf_class* value_p, mpf_content_buffer* buffer, size_t size) { for(size_t i = 0; i < size; ++i) { dump_mpf_to_buffer(value_p+i, buffer->prec_p+i, buffer->size_p+i, buffer->exp_p+i, buffer->limbs_p + i*(default_prec+1)); } } void undump_array_of_mpf_from_buffer(mpf_class* value_p, mpf_content_buffer* buffer, size_t size) { for(size_t i = 0; i < size; ++i) { undump_mpf_from_buffer(value_p+i, buffer->prec_p+i, buffer->size_p+i, buffer->exp_p+i, buffer->limbs_p + i*(default_prec+1)); } } // Thanks to Martin Broadhurst (http://www.martinbroadhurst.com/combinations-of-a-multiset.html) unsigned int next_multiset_combination(const unsigned int *multiset, unsigned int *ar, size_t n, size_t k) { bool finished = false; bool changed = false; size_t i; for (i = k - 1; !finished && !changed; i--) { if (ar[i] < multiset[i + (n - k)]) { /* Find the successor */ size_t j; for (j = 0; multiset[j] <= ar[i]; j++); /* Replace this element with it */ ar[i] = multiset[j]; if (i < k - 1) { /* Make the elements after it the same as this part of the multiset */ size_t l; for (l = i + 1, j = j + 1; l < k; l++, j++) { ar[l] = multiset[j]; } } changed = true; } finished = (i == 0); } if (!changed) { /* Reset to first combination */ for (i = 0; i < k; i++) { ar[i] = multiset[i]; } } return changed; } unsigned int* create_multiset(size_t n1, size_t n2, size_t n3, unsigned int* buffer) { // Buffer is supposed to be big enough! unsigned int* multiset = buffer; size_t i = 0; for(;i < n1; ++i) multiset[i] = 1; for(;i < n1+n2; ++i) multiset[i] = 2; for(;i < n1+n2+n3; ++i) multiset[i] = 3; return multiset; } unsigned int* create_first_combination_from_multiset(unsigned int* multiset, size_t k, unsigned int* buffer) { unsigned int* combination = buffer; for(size_t i = 0; i < k; ++i) combination[i] = multiset[i]; return combination; } void read_combination_of_multiset(unsigned int* combination, size_t p1, size_t p2, size_t p3, size_t* L1, size_t* L2, size_t* L3, size_t* R1, size_t* R2, size_t* R3) { size_t i = 0; *L1 = *L2 = *L3 = 0; - for(;combination[i]==1;++i) ++(*L1); *R1 = p1 - *L1; - for(;combination[i]==2;++i) ++(*L2); *R2 = p2 - *L2; - for(;combination[i]==3;++i) ++(*L3); *R3 = p3 - *L3; + for(;combination[i]==1;++i){ ++(*L1); } *R1 = p1 - *L1; + for(;combination[i]==2;++i){ ++(*L2); } *R2 = p2 - *L2; + for(;combination[i]==3;++i){ ++(*L3); } *R3 = p3 - *L3; } inline mpf_class binomial(size_t n, size_t p) { size_t p1 = (p > n-p) ? p : n-p; size_t p2 = n-p1; mpf_class p2fac = 1.0; for(size_t fac = 1; fac <= p2; ++fac) p2fac *= fac; mpf_class result = 1.0; for(size_t fac = n; fac > p1; --fac) result *= fac; result /= p2fac; return result; } inline mpf_class trinomial(size_t n1, size_t n2, size_t n3) { // This ensure that n1_ is the greatest of the three size_t n1_, n2_, n3_; if(n1 >= n2 && n1 >= n3) { n1_ = n1; n2_ = n2; n3_ = n3; } else if(n2 >= n1 && n2 >= n3) { n1_ = n2; n2_ = n1; n3_ = n3; } else { n1_ = n3; n2_ = n1; n3_ = n2; } mpf_class n2fac = 1.0; mpf_class n3fac = 1.0; for(size_t fac = 1; fac <= n2_; ++fac) n2fac *= fac; for(size_t fac = 1; fac <= n3_; ++fac) n3fac *= fac; mpf_class multinomial = 1.0; for(size_t fac = n1+n2+n3; fac > n1_; --fac) multinomial *= fac; multinomial /= (n2fac*n3fac); return multinomial; } mpf_class compute_EP0S(size_t p, size_t n, size_t pp, size_t p1, size_t p2, size_t p3) { assert(p1+p2+p3 == p + pp - 2*n); // Compute the symmetric part of (ep0 .n ep'0) mpf_class result = 0.0; // Sum over k-combinations: size_t k = p-n; if(k == 0) return 1.0; // NOTE(Sam): This allocation is suboptimal but I had issue with the crapy allocator // I implemented eralier and it works like that. I'll try to optimize it if needed // in the future. unsigned int* multiset = new unsigned int[p1+p2+p3]; unsigned int* combination = new unsigned int[k]; multiset = create_multiset(p1,p2,p3, multiset); combination = create_first_combination_from_multiset(multiset, k, combination); /* std::cout << "p1+p2+p3="< pp-n) { upper_left = p-n; upper_right = pp-n; } else { upper_left = pp-n; upper_right = p-n; } mpf_class lower_fac_over_upper_left_fac = 1.0; for(size_t fac = lower; fac > upper_left; --fac) lower_fac_over_upper_left_fac *= fac; mpf_class upper_right_fac = 1.0; for(size_t fac = 1; fac <= upper_right; ++fac) upper_right_fac *= fac; result *= upper_right_fac / lower_fac_over_upper_left_fac; return result; } mpf_class compute_EP0(size_t p, size_t n, size_t pp, size_t p1, size_t p2, size_t p3) { // Compute the traceless and symmetric part of (ep0 .n ep'0) mpf_class result = 0.0; size_t k1 = p1/2; size_t k2 = p2/2; size_t k3 = p3/2; for(size_t m1 = 0; m1 <= k1; ++m1) for(size_t m2 = 0; m2 <= k2; ++m2) for(size_t m3 = 0; m3 <= k3; ++m3) { mpf_class sign = ( (m1+m2+m3)%2 == 0 ? 1.0 : -1.0 ); mpf_class arr_factor_1 = arrangement_factor(p1, m1); mpf_class arr_factor_2 = arrangement_factor(p2, m2); mpf_class arr_factor_3 = arrangement_factor(p3, m3); mpf_class prefactor = sign * arr_factor_1 * arr_factor_2 * arr_factor_3; if( (2*(p+pp-2*n) - 2*(m1+m2+m3)-1) % 2 == (2*(p+pp-2*n)-1) % 2) { mpf_class one_over_double_fac = 1.0; for(size_t fac = (2*(p+pp-2*n)-1); fac > (2*(p+pp-2*n)-2*(m1+m2+m3)-1); --fac) one_over_double_fac *= fac; prefactor /= one_over_double_fac; } else { mpf_class numerator = 1.0; for(size_t fac = 1; fac <= (2*(p+pp-2*n)-2*(m1+m2+m3)-1); ++fac) numerator *= fac; mpf_class denominator = 1.0; for(size_t fac = 1; fac <= (2*(p+pp-2*n)-1); ++fac) denominator *= fac; prefactor *= numerator/denominator; } // Sum with h1+h2+h3=m size_t h1 = m1+m2+m3; size_t h2 = 0; size_t h3 = 0; do { mpf_class multinomial = trinomial(h1,h2,h3); result += prefactor * multinomial * compute_EP0S(p,n,pp, p1 - 2*(m1-h1), p2 - 2*(m2-h2), p3 - 2*(m3-h3) ); - h2 = (++h2)%(m1+m2+m3+1-h3); + h2 = (h2+1)%(m1+m2+m3+1-h3); if(h2 == 0) ++h3; h1 = m1+m2+m3 - h2 - h3; } while(h3 <= m1+m2+m3); // End sum with h1+h2+h3=m } return result; } mpf_class compute_full_tensor_contraction(size_t p, size_t n, size_t pp) { size_t P = p+pp-2*n; mpf_class result = 0.0; // Sum with p1+p2+p3=p+pp-2n size_t p1 = P; size_t p2 = 0; size_t p3 = 0; do { mpf_class multinomial = trinomial(p1,p2,p3); size_t idx1 = get_ep0_offset_from_p(P) + get_ep0_idx(p1,p2,p3); result += multinomial*all_ep0_p[idx1]*compute_EP0(p,n,pp, p1,p2,p3); - p2 = (++p2)%(P+1-p3); + p2 = (p2+1)%(P+1-p3); if(p2 == 0) ++p3; p1 = P - p2 - p3; } while(p3 <= P); // End sum with p1+p2+p3=p+pp-2n return result; } int main() { mpf_set_default_prec(default_prec); std::cout << "Default precision: " << mpf_get_default_prec() << "\n"; size_t p_max = 10; size_t all_ep0_size = get_ep0_offset_from_p(p_max+1); all_ep0_p = new mpf_class[all_ep0_size]; std::printf("For all ep0 up to p=%zu we need %zu independent variables.\n", p_max, all_ep0_size); DEBUG_clear_all_ep0(p_max, all_ep0_p); fill_all_ep0(p_max, all_ep0_p); #ifdef DEBUG_TRACE check_trace_of_all_ep0(p_max, all_ep0_p); #endif //* for(size_t i = 0; i < all_ep0_size; ++i) if(all_ep0_p[i] == -100.0) std::cout << "Found " << all_ep0_p[i] << "\n"; //*/ // NOTE(Sam): This will be indispensable for MPI communication /* mpf_content_buffer buffer; init_mpf_content_buffer(&buffer, all_ep0_size); dump_array_of_mpf_to_buffer(all_ep0_p, &buffer, all_ep0_size); DEBUG_dump_all_ep0(p_max, all_ep0_p); DEBUG_clear_all_ep0(p_max, all_ep0_p); DEBUG_dump_all_ep0(p_max, all_ep0_p); undump_array_of_mpf_from_buffer(all_ep0_p, &buffer, all_ep0_size); DEBUG_dump_all_ep0(p_max, all_ep0_p); clear_mpf_content_buffer(&buffer); //*/ //multiset_buffer = new unsigned int[2*p_max]; //combination_buffer = new unsigned int[2*p_max]; //* mpf_content_buffer buffer; init_mpf_content_buffer(&buffer, all_ep0_size); mp_exp_t exp; std::cout << "0." << compute_full_tensor_contraction(1,1,1).get_str(exp) << " x10^("<> p >> ignore >> n >> ignore >> pp >> ignore >> value >> ignore >> timing ) ) { break; } //std::cout << "0." << compute_full_tensor_contraction(1,1,1).get_str(exp) << " x10^("< ms_timing = t2 - t1; //std::cout << "0." << compute_full_tensor_contraction(1,1,1).get_str(exp) << " x10^("< 4) break; } mathematica_csv.close(); //DEBUG_dump_all_ep0(p_max, all_ep0_p); std::cout << "0." << compute_full_tensor_contraction(1,1,1).get_str(exp) << " x10^("<