与项目Euler的速度比较：C vs python vs erlang vs haskell

Speed comparison with Project Euler: C vs Python vs Erlang vs Haskell

我将ProjectEuler中的问题12作为编程练习，并比较我在C、Python、Erlang和Haskell中的(当然不是最佳的)实现。为了获得更高的执行时间，我搜索第一个除数大于1000的三角形数，而不是原始问题中所述的500。

结果如下：

C：

1
2
3
4
5
6
7

lorenzo@enzo:~/erlang$ gcc -lm -o euler12.bin euler12.c
lorenzo@enzo:~/erlang$ time ./euler12.bin
842161320

real 0m11.074s
user 0m11.070s
sys 0m0.000s

Python：

1
2
3
4
5
6

lorenzo@enzo:~/erlang$ time ./euler12.py
842161320

real 1m16.632s
user 1m16.370s
sys 0m0.250s

带pypy的python：

1
2
3
4
5
6

lorenzo@enzo:~/Downloads/pypy-c-jit-43780-b590cf6de419-linux64/bin$ time ./pypy /home/lorenzo/erlang/euler12.py
842161320

real 0m13.082s
user 0m13.050s
sys 0m0.020s

Erlang：

1
2
3
4
5
6
7
8
9
10

lorenzo@enzo:~/erlang$ erlc euler12.erl
lorenzo@enzo:~/erlang$ time erl -s euler12 solve
Erlang R13B03 (erts-5.7.4) [source] [64-bit] [smp:4:4] [rq:4] [async-threads:0] [hipe] [kernel-poll:false]

Eshell V5.7.4 (abort with ^G)
1> 842161320

real 0m48.259s
user 0m48.070s
sys 0m0.020s

Haskell：

1
2
3
4
5
6
7
8
9

lorenzo@enzo:~/erlang$ ghc euler12.hs -o euler12.hsx
[1 of 1] Compiling Main ( euler12.hs, euler12.o )
Linking euler12.hsx ...
lorenzo@enzo:~/erlang$ time ./euler12.hsx
842161320

real 2m37.326s
user 2m37.240s
sys 0m0.080s

总结：

C：100%
Python：692%(118%有Pypy)
Erlang：436%(采纳RichardC的建议后为135%)
哈斯克尔：1421%

我认为c有一个很大的优势，因为它使用long进行计算，而不像其他三个整数那样使用任意长度的整数。另外，它不需要先加载运行时(其他的呢？).

问题1：erlang、python和haskell是因为使用了任意长度的整数而失去速度，还是只要这些值小于MAXINT，它们就不会失去速度？

问题2：为什么哈斯克尔这么慢？是否有关闭制动器的编译器标志，或者它是我的实现？(后者很有可能，因为哈斯克尔是一本给我盖了七个印章的书。)

问题3：你能给我一些提示，如何在不改变我确定因素的方式的情况下优化这些实现吗？以任何方式优化：更好、更快、更"本机"的语言。

编辑：

问题4：我的函数实现是否允许LCO(最后一次调用优化，即尾部递归消除)，从而避免在调用堆栈中添加不必要的帧？

虽然我不得不承认我的haskell和erlang知识非常有限，但我确实尝试在四种语言中尽可能地实现相同的算法。

使用的源代码：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

#include <stdio.h>
#include <math.h>

int factorCount (long n)
{
double square = sqrt (n);
int isquare = (int) square;
int count = isquare == square ? -1 : 0;
long candidate;
for (candidate = 1; candidate <= isquare; candidate ++)
if (0 == n % candidate) count += 2;
return count;
}

/*
 * Walk the triangle numbers 1, 3, 6, 10, ... and print the first one whose
 * divisor count (as reported by factorCount) is at least 1001, i.e. more
 * than 1000.
 */
int main ()
{
    long triangle = 1;
    int index = 1;
    while (factorCount (triangle) < 1001)
    {
        index ++;
        triangle += index;
    }
    /* The newline has to be the escape sequence \n: a raw line break inside
       a string literal does not compile. */
    printf ("%ld\n", triangle);
    return 0;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

#! /usr/bin/env python3.2

import math

def factorCount (n):
    """Return the number of positive divisors of n.

    Trial-divides by every candidate up to floor(sqrt(n)); each hit stands
    for the pair (candidate, n // candidate) and therefore adds two. For a
    perfect square the root pairs with itself, so the count starts at -1.
    """
    square = math.sqrt (n)
    isquare = int (square)
    count = -1 if isquare == square else 0
    for candidate in range (1, isquare + 1):
        if not n % candidate: count += 2
    return count

# Walk the triangle numbers 1, 3, 6, 10, ... until one has at least 1001
# divisors (i.e. more than 1000), then print it.
triangle = 1
index = 1
while factorCount (triangle) < 1001:
    index += 1
    triangle += index

print (triangle)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

-module (euler12).
-compile (export_all).

%% Number of positive divisors of Number, found by trial division up to
%% its (floating-point) square root.
factorCount (Number) ->
    Root = math:sqrt (Number),
    factorCount (Number, Root, 1, 0).

%% Past the root: every divisor pair has been counted.
factorCount (_Number, Sqrt, Candidate, Count) when Candidate > Sqrt ->
    Count;
%% Candidate is exactly the root (perfect square): it pairs with itself,
%% so it contributes one divisor rather than two.
factorCount (_Number, Sqrt, Candidate, Count) when Candidate == Sqrt ->
    Count + 1;
%% Below the root: a divisor accounts for itself and its cofactor.
factorCount (Number, Sqrt, Candidate, Count) ->
    Increment =
        case Number rem Candidate of
            0 -> 2;
            _ -> 0
        end,
    factorCount (Number, Sqrt, Candidate + 1, Count + Increment).

%% Triangle is the Index-th triangle number. Return the first triangle
%% number from here on that has more than 1000 divisors.
nextTriangle (Index, Triangle) ->
    case factorCount (Triangle) > 1000 of
        true -> Triangle;
        false -> nextTriangle (Index + 1, Triangle + Index + 1)
    end.

%% Entry point for `erl -s euler12 solve`: print the answer and stop the VM.
solve () ->
    Answer = nextTriangle (1, 1),
    io:format ("~p~n", [Answer]),
    halt (0).

1
2
3
4
5
6
7
8
9
10
11
12
13
14

-- Number of positive divisors of 'number'. The helper counts two divisors
-- per hit up to floor(sqrt number); for a perfect square the root was
-- counted twice, so one is subtracted.
factorCount number = factorCount' number isquare 1 0 - (fromEnum $ square == fromIntegral isquare)
    where square = sqrt $ fromIntegral number
          isquare = floor square

-- Tail-recursive trial division: 'sqrt' is the integer upper bound,
-- 'candidate' the current trial divisor, 'count' the running total.
factorCount' number sqrt candidate count
    | fromIntegral candidate > sqrt = count
    | number `mod` candidate == 0 = factorCount' number sqrt (candidate + 1) (count + 2)
    | otherwise = factorCount' number sqrt (candidate + 1) count

-- 'triangle' is the index-th triangle number; return the first triangle
-- number from here on with more than 1000 divisors.
nextTriangle index triangle
    | factorCount triangle > 1000 = triangle
    | otherwise = nextTriangle (index + 1) (triangle + index + 1)

main = print $ nextTriangle 1 1

相关讨论

有一个网页只显示语言的速度差异。根据你的数据和爆炸页面，我认为你的算法不如爆炸页面中使用的其他算法好。
@Jochen(和Seth)并不认为C速度快或者很棒，但它被认为是很容易编写性能代码的(这可能不是真的，但大多数程序似乎都能，所以是真的)。正如我在回答中所探讨的，并且发现随着时间的推移，程序员的技能和所选语言的公共优化知识是非常重要的(尤其是对haskell)。
您应该能够通过放置主模块级代码(triangle = 1来提高python代码的速度。print(triangle)变成一个函数(main然后在末尾做if __name__ =="__main__": main()。(很像您的C代码。)这将使局部变量引用更快。
考虑到我在回答这个问题时的建议，我请求考虑到我的原因，重新尝试这个基准测试。
在提出诸如1和2之类的问题之前，请优化您的代码，或让其他人帮助您完成这项工作。
"1742%"—百分比夸大了差异(它们使差异看起来比实际大100倍)，而且可能会令人困惑。而是说慢了多少倍。
@Muzaaya我考虑了您的输入，并分别用定时器：tc和time.clock测量了程序。没有真正的变化。真正的刹车是使用python3.2而不是pypy，而不是用+native编译erlc。不幸的是，我不能用哈斯克尔复制托马斯的速度提升。相同的编译器标志、相同的代码，并且我的机器上的执行时间没有变化。
@iGouy 1742%是打字错误。它是1421%。"而是说慢了多少倍"。你只需要在数字中加上一个小数点。1421%=慢14.21倍。100%＝1。
@过度无聊-为什么一开始就夸大差异？
LCO是什么意思？
@Kizzx2上次调用优化。en.wikipedia.org/wiki/tail_电话
@hyperboreus哦，这对我来说是个新名字，因为ol’tail递归：p在那个页面上找不到任何对"lco"的引用，但是无论如何。
刚刚用mathematica检查过——它需要0.25秒(用c，这里需要6秒)，代码是：Euler12[x_Integer] := Module[{s = 1}, For[i = 2, DivisorSigma[0, s] < x, i++, s += i]; s]。好哇！
还有其他人记得C和大会之间的战争吗？"当然！你可以用C语言写代码快10倍，但你的C语言能运行得这么快吗？"我相信机器代码和装配之间也发生过同样的战斗。
肯定是JS。但我的重点是学习如何从不同的语言中获得一些速度，以便能够在未来的生产代码中应用学习项。
@JS：可能不会，因为汇编只是一组您输入的助记键，而不是原始的二进制机器代码——通常它们之间有1-1的对应关系。
我刚刚写了一个C++实现，在212 ms中在Windows上执行并产生了正确的答案。OPS最初的想法是比较执行时间，但是我要指出的是，你如何在C++中聪明，并以这样的方式在内存中对齐问题，这将比在任何解释的局域网中执行的更快。标尺。我对haskell不是很精通，但我不相信你对什么时候进入哪个缓存有那么多的控制。
@格伦：你为什么不分享你的密码？
@布鲁诺，因为我的代码实际上没有任何聪明的地方！如果有几个数量级的改进，然后再添加一些聪明的部分，就不会很好地测试。
@j0ker5与我猜想的是相同的算法(haskell wiki中的算法，同样简洁)，我得到了一个在0.32秒(hyperboreus的c在8.5秒内运行)内运行的解决方案。所以，可以比较。我欣赏Mathematica的性能和简洁的解决方案，但最后我发现它没有合理地提取到另一种语言(或者有一个开放的实现，很明显)。情况变了吗？
@托马斯：不幸的是，我不知道。
对于haskell:-o2，结论给出了大约3x的加速，并使用int代替整数，大约4x-6x用于12x-14x和更多的总加速。
只是一个旁注-这里有一些其他有趣的哈斯克尔方法来解决同一个问题-见5和它的修订。滚动文本.org/project-euler-problem-12
如果你发现自己写的python比haskell慢，那可能是因为你自己对haskell缺乏经验。它可能并不总是比C快，但它通常非常接近。
@用户8174234你的意思是更快？
我想从以数学计算为中心的基准任务/代码中删除打印操作是不合理还是愚蠢？

测试环境：GHC 7.0.3、gcc 4.4.6、Linux 2.6.29，x86_64 Core2 Duo(2.5GHz)机器；Haskell用ghc -O2 -fllvm -fforce-recomp编译，C用gcc -O3 -lm编译。

你的C程序运行了8.4秒(比你测得的快，可能是因为用了-O3)
Haskell解决方案运行了36秒(得益于-O2标志)
你的factorCount'代码没有显式标注类型，因而默认使用Integer(感谢Daniel纠正了我这里的误判！)。用Int给出显式的类型签名(这本来就是标准做法)后，时间变为11.1秒
在factorCount'中你不必要地调用了fromIntegral。不过修正之后时间没有变化(编译器很聪明，算你走运)。
你在用rem更快且已足够的地方用了mod。改过之后时间变为8.5秒。
factorCount'不断地传递两个从不改变的额外参数(number、sqrt)。对其做worker/wrapper变换后得到：

1
2
3
4
5
6

$ time ./so
842161320

real 0m7.954s
user 0m7.944s
sys 0m0.004s

没错，7.95秒，始终比C的解决方案快半秒。即使不加-fllvm标志，我仍然得到8.182秒，所以在这个例子中NCG后端也做得很好。

结论：Haskell非常棒。

由此产生的代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

-- Number of positive divisors of 'number'. The helper counts two divisors
-- per hit up to floor(sqrt number); for a perfect square the root was
-- counted twice, so one is subtracted.
factorCount number = factorCount' number isquare 1 0 - (fromEnum $ square == fromIntegral isquare)
    where square = sqrt $ fromIntegral number
          isquare = floor square

-- Worker/wrapper form: 'number' and 'sqrt' never change, so only the
-- candidate and the running count are threaded through the local loop 'go'.
factorCount' :: Int -> Int -> Int -> Int -> Int
factorCount' number sqrt candidate0 count0 = go candidate0 count0
    where
    go candidate count
        | candidate > sqrt = count
        | number `rem` candidate == 0 = go (candidate + 1) (count + 2)
        | otherwise = go (candidate + 1) count

-- 'triangle' is the index-th triangle number; return the first triangle
-- number from here on with more than 1000 divisors.
nextTriangle index triangle
    | factorCount triangle > 1000 = triangle
    | otherwise = nextTriangle (index + 1) (triangle + index + 1)

main = print $ nextTriangle 1 1

编辑：既然已经探讨过了，下面来逐一回答这些问题

Question 1: Do erlang, python and haskell lose speed due to using
arbitrary length integers or don't they as long as the values are less
than MAXINT?

在Haskell中，使用Integer比使用Int慢，但慢多少取决于所执行的计算。幸运的是(对64位机器而言)Int就足够了。为了可移植性，你或许应该把我的代码改写为使用Int64或Word64(C并不是唯一拥有long的语言)。

Question 2: Why is haskell so slow? Is there a compiler flag that
turns off the brakes or is it my implementation? (The latter is quite
probable as haskell is a book with seven seals to me.)

Question 3: Can you offer me some hints how to optimize these
implementations without changing the way I determine the factors?
Optimization in any way: nicer, faster, more"native" to the language.

这就是我在上面回答的内容。答案是：

(0)通过-O2启用优化
(1)尽可能使用快速的(尤其是可拆箱的)类型
(2)用rem而不是mod(一个经常被遗忘的优化)
(3)worker/wrapper变换(也许是最常见的优化)。

Question 4: Do my functional implementations permit LCO and hence
avoid adding unnecessary frames onto the call stack?

是的，这不是问题所在。做得不错，很高兴你考虑到了这一点。

相关讨论

为什么rem和mod的速度有这么大的差别？
@卡尔，因为rem实际上是mod操作的一个子组件(它们不同)。如果您在GHC基本库中查看，您会看到mod测试有几种情况，并相应地调整符号。(见Base.lhs中的modInt#)
另一个数据点：我写了一个C程序的快速haskell翻译，而不看@hyperboreus的haskell。所以它有点接近标准的惯用haskell，我故意添加的唯一优化是在阅读完这个答案(heh，oops)后用rem替换mod。查看我的计时链接，但短版本"几乎与C完全相同"。
奇怪的事情是过去时。在我的机器(phenomx4，内核2.6.38)上，使用ghc 7.0.3(从源代码构建)，使用-O2 -fllvm -fforce-recomp编译它，将运行时间从2:37更改为2:33。当我复制并粘贴优化后的代码时，时间会增加到2:34。我不明白？另一件事：如何在Erlang中实现这个worker/wrapper转换？
看起来Int足以得到正确的答案(c解决方案使用long，这是Int的同义词)。另外，在我的机器(Core 2 Duo 2.4GHz Ubuntu x86)上，该解决方案在2.7s内运行，使用Int，18 s内运行，使用Integer，33 s内运行，使用int64。@hyperboreus，也许你在运行一个错误的可执行文件？
即使我觉得C版在我的机器上运行得更快，我现在对哈斯克尔有了新的尊重。+ 1
@hyperboreus我建议您在irc.freenode.net上加入haskell-stackoverflow注释对于解决由不同编译器/机器配置引起的问题是非常糟糕的。
@赛斯，这很有趣。我确实尝试将const添加到正确的变量中，并将llvm用于C程序，但在不更改算法的情况下，性能没有任何显著变化。
这对我来说是相当令人惊讶的，尽管我还没有尝试过。由于最初的factorCount'是tail递归的，所以我认为编译器可以发现没有更改的额外参数，并且只针对更改的参数优化tail递归(haskell毕竟是一种纯语言，这应该很容易)。有人认为编译器可以做到这一点，或者我应该回去读更多的理论论文吗？
@奇兹2：有一张GHC的票要加。根据我所了解的，这个转换可以导致额外的闭包对象分配。这在某些情况下意味着更差的性能，但是正如JohanTibell在他的博客文章中建议的那样，如果结果包装可以内联的话，就可以避免这种情况。
@Kizzx2：正如Haskell中许多简单而明显的优化一样，困难不在于如何进行优化，而在于何时进行优化。不可靠的优化有时会使事情变得更糟，这比根本没有优化更严重。
为什么非专门化的多态函数会使代码慢3倍？？
实际上，运行的原始代码不是多态的，即使在没有优化的情况下编译(使用ghc-7.*；它是多态的，没有优化的情况下使用6.12.3.-使用优化的情况下，所有代码都是单态的)。它使用没有类型签名的Integer(count除外，fromEnum的Int除外)，因为除了main外，没有任何东西是出口的。如果强制使用多态代码，那会比Integer慢很多(对我来说大约是2&215)。
@丹尼尔菲舍尔的好成绩。回想起来，我不确定是什么让我认为这是一个专门化的问题(这没有意义——正如我困惑的评论所说)。我很惊讶有人花了这么长时间才纠正我！
将原来的C程序中的long改为Int，使其运行速度提高两倍(产生正确的结果)。不幸的是，在这种情况下，haskell并不比c快。
@当然，您可以继续优化任何语言的实现和编译器。例如，请参见Raedwulf的答案。
哎呀。。没注意到：)
这条线是典型的草人。我们看到了几个未优化的代码，每个人都从他们的阵营中获取代码，优化它，然后看它是外面的快速语言。上面的答案错误地声称haskell是最快的，接下来可笑地说python是最快的，然后有人给出了一个实际优化的C代码，比上面的答案haskell更好。甚至有评论说Mathematica是最快的…
你提出了一个很好的观点——任何人都不应该对一个微基准进行过多的阅读，或者就此而言，根据语言的性能对其价值进行过多的阅读。上面我只声称结果haskell比asker提供的C实现运行得更快。我还对优化的C实现发表了评论，提到当时我无法生成性能类似的haskell，并对使用不同算法的Mathematica解决方案发表了评论，该算法同时违背了问题的本质，同时也强调了正确的答案。
哈斯克尔很可能是最适合我的思维方式的语言：)谢谢你证明它与C一样！
@Novice，所以有空间找到一个比较所有高度优化版本的答案？

Erlang的实现存在一些问题。作为下面的基线，我测量的未修改erlang程序的执行时间是47.6秒，而C代码的执行时间是12.7秒。

如果要运行计算密集型的Erlang代码，首先应该使用本机代码。用erlc +native euler12编译的时间降到了41.3秒。然而，这比在这种代码上进行本机编译的速度要慢得多(仅15%)，问题是您使用了-compile(export_all)。这对实验很有用，但是所有函数都可能从外部访问这一事实使得本机编译器非常保守。(普通光束仿真器没有太大的影响。)用-export([solve/0]).替换这个声明可以得到更好的加速：31.5秒(几乎是基线的35%)。

但代码本身有一个问题：对于factorCount循环中的每个迭代，都要执行以下测试：

1	factorCount (_, Sqrt, Candidate, Count) when Candidate == Sqrt -> Count + 1;

C代码不会这样做。一般来说，在同一代码的不同实现之间进行公平比较是很困难的，特别是如果算法是数字的，因为您需要确保它们实际上在做相同的事情。在一个实现中，由于某种类型转换而导致的轻微舍入错误可能会导致它比另一个执行更多的迭代，即使两个最终都达到相同的结果。

为了消除这个可能的错误源(并在每次迭代中去掉额外的测试)，我按照下面的方式重新编写了factorCount函数，它是在C代码上紧密建模的：

1
2
3
4
5
6
7
8
9
10
11
12
13

%% Number of positive divisors of N. The count starts at -1 when N is a
%% perfect square, because the loop below adds 2 for the root as well.
factorCount (N) ->
    Sqrt = math:sqrt (N),
    ISqrt = trunc (Sqrt),
    Initial =
        case ISqrt == Sqrt of
            true -> -1;
            false -> 0
        end,
    factorCount (N, ISqrt, 1, Initial).

%% Trial division over 1..ISqrt; every divisor found stands for a pair.
factorCount (_N, ISqrt, Candidate, Count) when Candidate > ISqrt ->
    Count;
factorCount (N, ISqrt, Candidate, Count) ->
    Bump =
        case N rem Candidate of
            0 -> 2;
            _ -> 0
        end,
    factorCount (N, ISqrt, Candidate + 1, Count + Bump).

这个重写(没有export_all和本机编译)给了我以下运行时间：

1
2
3
4
5
6
7

$ erlc +native euler12.erl
$ time erl -noshell -s euler12 solve
842161320

real 0m19.468s
user 0m19.450s
sys 0m0.010s

与C代码相比还不错：

1
2
3
4
5
6

$ time ./a.out
842161320

real 0m12.755s
user 0m12.730s
sys 0m0.020s

考虑到Erlang根本不适合编写数字代码，在这样的程序中，比C慢50%是相当不错的。

最后，关于你的问题：

问题1：由于使用任意长度整数或它们不是只要值小于maxint吗？

是的，有点。在Erlang中，没有办法说"使用32/64位带环绕的算术"，因此，除非编译器能够证明整数上的某些边界(通常不能证明)，否则它必须检查所有的计算，以查看它们是否适合于单个带标记的字，或者是否必须将它们转换为堆分配的Bignums。即使在运行时实践中从未使用过bignums，也必须执行这些检查。另一方面，这意味着你知道如果你突然给一个比以前更大的输入，算法永远不会因为一个意想不到的整数被包围而失败。

问题4：我的函数实现是否允许LCO，从而避免在调用堆栈中添加不必要的帧？

是的，您的Erlang代码对于上次调用优化是正确的。

相关讨论

关于python优化，除了使用pypy(对于代码零更改的显著加速)，您还可以使用pypy的翻译工具链编译一个rpython兼容版本，或者使用cython构建一个扩展模块，这两个模块在我的测试中都比C版本快，cython模块几乎是f的两倍。AST。作为参考，我还包括C和Pypy基准测试结果：

C(用gcc -O3 -lm编译)

1
2
3
4
5
6
7

% time ./euler12-c
842161320

./euler12-c 11.95s
user 0.00s
system 99%
cpu 11.959 total

PyPy 1.5

1
2
3
4
5
6

% time pypy euler12.py
842161320
pypy euler12.py
16.44s user
0.01s system
99% cpu 16.449 total

rpython(使用最新的pypy版本，c2f583445aee)

1
2
3
4
5
6

% time ./euler12-rpython-c
842161320
./euler12-rpy-c
10.54s user 0.00s
system 99%
cpu 10.540 total

Cython 0.15

1
2
3
4
5
6

% time python euler12-cython.py
842161320
python euler12-cython.py
6.27s user 0.00s
system 99%
cpu 6.274 total

rpython版本有几个关键更改。要转换成独立程序，您需要定义您的target，在本例中是main函数。它应该接受sys.argv作为它的唯一论据，并且需要返回一个int。您可以使用translate.py、% translate.py euler12-rpython.py来翻译它，后者翻译为c并为您编译它。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

# euler12-rpython.py

import math, sys

def factorCount(n):
    """Return the number of positive divisors of n.

    Trial-divides by every candidate up to floor(sqrt(n)); each hit adds two
    (the candidate and its cofactor). For a perfect square the root pairs
    with itself, so the count starts at -1.
    """
    square = math.sqrt(n)
    isquare = int(square)
    count = -1 if isquare == square else 0
    # xrange: this file targets Python 2 / RPython.
    for candidate in xrange(1, isquare + 1):
        if not n % candidate: count += 2
    return count

def main(argv):
triangle = 1
index = 1
while factorCount(triangle) < 1001:
index += 1
triangle += index
print triangle
return 0

if __name__ == '__main__':
    main(sys.argv)

def target(*args):
    # Hook looked up by translate.py (see the surrounding text): returns the
    # entry-point function together with None.
    return main, None

cython版本被重写为扩展模块_euler12.pyx，我从一个普通的python文件导入和调用它。_euler12.pyx本质上与您的版本相同，带有一些额外的静态类型声明。py具有使用python setup.py build_ext --inplace构建扩展的常规样板文件。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

# _euler12.pyx
from libc.math cimport sqrt

cdef int factorCount(int n):
    # Number of positive divisors of n, by trial division up to
    # floor(sqrt(n)). Each hit adds two; for a perfect square the count
    # starts at -1 because the root pairs with itself.
    cdef int candidate, isquare, count
    cdef double square
    square = sqrt(n)
    isquare = int(square)
    count = -1 if isquare == square else 0
    for candidate in range(1, isquare + 1):
        if not n % candidate: count += 2
    return count

cpdef main():
    # Print the first triangle number with at least 1001 divisors.
    cdef int triangle = 1, index = 1
    while factorCount(triangle) < 1001:
        index += 1
        triangle += index
    print triangle

# euler12-cython.py
import _euler12
# Run the entry point of the _euler12 module imported above.
_euler12.main()

# setup.py
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

# The single extension module, built from the Cython source file.
ext_modules = [
    Extension("_euler12", ["_euler12.pyx"]),
]

# Register Cython's build_ext so `python setup.py build_ext --inplace`
# compiles the .pyx file.
setup(
    name='Euler12-Cython',
    cmdclass=dict(build_ext=build_ext),
    ext_modules=ext_modules,
)

老实说，我对rpython和cython都没有太多经验，对结果感到惊喜。如果您使用的是cpython，那么在cython扩展模块中编写CPU密集型代码似乎是优化程序的一种非常简单的方法。

相关讨论

Question 3: Can you offer me some hints how to optimize these implementations
without changing the way I determine the factors? Optimization in any
way: nicer, faster, more"native" to the language.

C实现是次优的(如ThomasM.Dubuisson所暗示的那样)，版本使用64位整数(即长数据类型)。稍后我将研究程序集列表，但是有一个有根据的猜测，编译后的代码中有一些内存访问正在进行，这使得使用64位整数的速度明显减慢。这就是或生成的代码(您可以在SSE寄存器中容纳少于64位的整数，或者将双精度数舍入到64位整数，这样做会比较慢)。

下面是修改后的代码(只需将long替换为int，而我显式地内联factorcount，尽管我认为gcc-o3不需要这样做)：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

#include <stdio.h>
#include <math.h>

static inline int factorCount(int n)
{
double square = sqrt (n);
int isquare = (int)square;
int count = isquare == square ? -1 : 0;
int candidate;
for (candidate = 1; candidate <= isquare; candidate ++)
if (0 == n % candidate) count += 2;
return count;
}

/*
 * Walk the triangle numbers 1, 3, 6, 10, ... and print the first one whose
 * divisor count is at least 1001, i.e. more than 1000.
 */
int main ()
{
    int triangle = 1;
    int index = 1;
    while (factorCount (triangle) < 1001)
    {
        index++;
        triangle += index;
    }
    /* The newline has to be the escape sequence \n: a raw line break inside
       a string literal does not compile. */
    printf ("%d\n", triangle);
    return 0;
}

跑步+计时：

1
2
3

$ gcc -O3 -lm -o euler12 euler12.c; time ./euler12
842161320
./euler12 2.95s user 0.00s system 99% cpu 2.956 total

作为参考，托马斯在前面的答案中的haskell实现给出了：

1
2
3
4
5

$ ghc -O2 -fllvm -fforce-recomp euler12.hs; time ./euler12 [9:40]
[1 of 1] Compiling Main ( euler12.hs, euler12.o )
Linking euler12 ...
842161320
./euler12 9.43s user 0.13s system 99% cpu 9.602 total

结论：虽然ghc是一个很好的编译器，但是gcc通常生成更快的代码。

相关讨论

看看这个博客。在过去的一年左右，他在haskell和python中做了一些项目euler问题，他通常发现haskell更快。我认为，在这些语言之间，它与您的流畅性和编码风格有更多的关系。

当谈到Python速度时，您使用的是错误的实现！试试派比，像这样的事情你会发现它要快得多。

相关讨论

通过使用haskell包中的一些函数，可以大大加快haskell实现的速度。在本例中，我使用的是primes，它只是与"cabal install primes"一起安装的；)

1
2
3
4
5
6
7
8
9

import Data.Numbers.Primes
import Data.List

-- Triangle numbers 1, 3, 6, 10, ... as running sums of the naturals.
triangleNumbers = scanl1 (+) [1..]

-- Divisor count from the prime factorisation: product of (exponent + 1).
nDivisors n = product [length run + 1 | run <- group (primeFactors n)]

-- First triangle number with more than 500 divisors.
answer = head [t | t <- triangleNumbers, nDivisors t > 500]

main :: IO ()
main = putStrLn ("First triangle number to have over 500 divisors:" ++ show answer)

计时：

您的原始程序：

1
2
3
4

PS> measure-command { bin\012_slow.exe }

TotalSeconds : 16.3807409
TotalMilliseconds : 16380.7409

改进的实施

1
2
3
4

PS> measure-command { bin\012.exe }

TotalSeconds : 0.0383436
TotalMilliseconds : 38.3436

如你所见，这台机器在38毫秒内运行，而你的机器在16秒内运行。

编译命令：

1 2	ghc -O2 012.hs -o bin\012.exe ghc -O2 012_slow.hs -o bin\012_slow.exe

相关讨论

只是为了好玩。以下是更"本机"的haskell实现：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

import Control.Applicative
import Control.Monad
import Data.Either
import Math.NumberTheory.Powers.Squares

-- True when the value equals its own rounding.
isInt :: RealFrac c => c -> Bool
isInt = (==) <$> id <*> fromInteger . round

intSqrt :: (Integral a) => a -> Int
--intSqrt = fromIntegral . floor . sqrt . fromIntegral
intSqrt = fromIntegral . integerSquareRoot'

-- Prime factors with repetition: repeatedly split off the smallest divisor
-- found in 2..intSqrt n, falling back to n itself when there is none.
factorize :: Int -> [Int]
factorize 1 = []
factorize n = first : factorize (quot n first)
    where first = (!! 0) $ [a | a <- [2..intSqrt n], rem n a == 0] ++ [n]

-- Run-length encode the factor list into (value, frequency) pairs. The
-- fold is seeded with the sentinel pair (0,0), which stays in the result.
factorize2 :: Int -> [(Int,Int)]
factorize2 = foldl (\ls@((val,freq):xs) y -> if val == y then (val,freq+1):xs else (y,1):ls) [(0,0)] . factorize

-- Product of (frequency + 1); the (0,0) sentinel contributes a factor of 1.
numDivisors :: Int -> Int
numDivisors = foldl (\acc (_,y) -> acc * (y+1)) 1 <$> factorize2

-- (n, n-th triangle number) -> (n+1, (n+1)-th triangle number).
nextTriangleNumber :: (Int,Int) -> (Int,Int)
nextTriangleNumber (n,acc) = (n+1,acc+n+1)

-- Left stops the search at a triangle number with more than k divisors;
-- Right carries on with the next triangle number.
forward :: Int -> (Int, Int) -> Either (Int, Int) (Int, Int)
forward k val@(n,acc) = if numDivisors acc > k then Left val else Right (nextTriangleNumber val)

problem12 :: Int -> (Int, Int)
problem12 n = (!!0) . lefts . scanl (>>=) (forward n (1,1)) . repeat . forward $ n

main = do
    let (n,val) = problem12 1000
    print val

使用ghc -O3，这在我的机器(1.73GHz核心i7)上持续运行0.55-0.58秒。

对于C版本，一个更有效的factorCount函数：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

/*
 * Count the positive divisors of n from its prime factorisation: the
 * result is the product of (exponent + 1) over all prime factors.
 *
 * Returns 0 for n == 0 (which would otherwise loop forever in the
 * factor-of-two loop). Intended for positive n.
 */
int factorCount (int n)
{
    int count = 1;
    int candidate, tmpCount;

    if (n == 0)
        return 0;

    /* Exponent of 2: count becomes (exponent + 1). */
    while (n % 2 == 0) {
        count++;
        n /= 2;
    }
    /*
     * Odd trial divisors up to and including sqrt(n). The bound has to be
     * inclusive: with a strict "<" a remaining cofactor that is the square
     * of a prime (9, 25, ...) is never divided and is then counted as a
     * single prime below. Writing it as candidate <= n / candidate also
     * avoids overflowing candidate * candidate for n close to INT_MAX.
     */
    for (candidate = 3; candidate <= n / candidate; candidate += 2)
        if (n % candidate == 0) {
            tmpCount = 1;
            do {
                tmpCount++;
                n /= candidate;
            } while (n % candidate == 0);
            count *= tmpCount;
        }
    /* Whatever is left above 1 is a single prime with exponent 1. */
    if (n > 1)
        count *= 2;
    return count;
}

使用gcc -O3 -lm将long改为ints，持续运行0.31-0.35秒。

如果利用第n个三角形数=n*(n+1)/2和n和(n+1)具有完全不同的素因式分解这一事实，则可以使这两者运行得更快，因此可以将每一半的因子数相乘以找到整体的因子数。下列内容：

1
2
3
4
5
6
7
8
9
10

/*
 * Here "triangle" holds the index k of the current triangle number
 * k * (k + 1) / 2. The factor 2 is divided out of whichever of k, k + 1 is
 * even, and the divisor counts of the two parts are multiplied: count1 is
 * the count carried over from the previous step, count2 the one for k + 1.
 * Stops at the first product of at least 1001 and prints that triangle
 * number.
 */
int main ()
{
    int triangle = 0,count1,count2 = 1;
    do {
        count1 = count2;
        count2 = ++triangle % 2 == 0 ? factorCount(triangle+1) : factorCount((triangle+1)/2);
    } while (count1*count2 < 1001);
    /* The newline has to be the escape sequence \n: a raw line break inside
       a string literal does not compile. */
    printf ("%lld\n", ((long long)triangle)*(triangle+1)/2);
    return 0;
}

将C代码的运行时间减少到0.17-0.19秒，它可以处理更大的搜索——在我的机器上，超过10000个因子需要43秒。我给感兴趣的读者留了一份类似的哈斯克尔快报。

相关讨论

Question 1: Do erlang, python and haskell loose speed due to using arbitrary length integers or don't they as long as the values are less than MAXINT?

这不太可能。我不能说太多关于Erlang和Haskell的事情(好吧，也许下面有点关于Haskell)，但我可以指出Python中的许多其他瓶颈。每次程序试图在python中使用某些值执行操作时，它都应该验证这些值是否来自正确的类型，并且需要花费一些时间。你的factorCount函数只是用range (1, isquare + 1)分配一个列表不同的时间，运行时，malloc样式的内存分配比用计数器在一个范围内迭代慢得多，就像在C语言中那样。值得注意的是，factorCount()被多次调用，因此分配了很多列表。另外，让我们不要忘记python是被解释的，而cpython解释器并没有很好地关注优化。

编辑：哦，我注意到您使用的是python 3，所以range()不返回列表，而是一个生成器。在这种情况下，我关于分配列表的观点有一半是错误的：函数只分配range对象，虽然效率很低，但没有分配包含大量项目的列表那么低。

Question 2: Why is haskell so slow? Is there a compiler flag that turns off the brakes or is it my implementation? (The latter is quite probable as haskell is a book with seven seals to me.)

你在拥抱吗？拥抱是一个相当慢的口译员。如果你正在使用它，也许你可以更好地使用ghc-但我只是在考虑低血压，一个好的haskell编译器在幕后所做的事情是非常吸引人的，远远超出我的理解范围：)

Question 3: Can you offer me some hints how to optimize these implementations without changing the way I determine the factors? Optimization in any way: nicer, faster, more"native" to the language.

我会说你在玩一个不起眼的游戏。了解各种语言的最好部分是尽可能以最不同的方式使用它们。)但我离题了，我对这一点没有任何建议。很抱歉，我希望有人能在这种情况下帮助您：)

Question 4: Do my functional implementations permit LCO and hence avoid adding unnecessary frames onto the call stack?

据我所知，您只需要确保递归调用是返回值之前的最后一个命令。换句话说，像下面这样的函数可以使用这种优化：

1
2
3
4
5
6
7

def factorial(n, acc=1):
    """Return acc * n! using an accumulator.

    The recursive call is the last thing the function does, so nothing is
    left to compute after it returns (tail position).
    """
    if n > 1:
        acc = acc * n
        n = n - 1
        return factorial(n, acc)
    else:
        return acc

但是，如果您的函数像下面的函数那样，则不会有这样的优化，因为在递归调用之后有一个操作(乘法)：

1
2
3
4
5
6

def factorial2(n):
    """Return n! without an accumulator.

    The multiplication happens after the recursive call returns, so the
    call is not in tail position.
    """
    if n > 1:
        f = factorial2(n-1)
        return f*n
    else:
        return 1

我在一些局部变量中分离了操作，以便清楚地了解执行哪些操作。然而，最常见的是如下所示的这些函数，但它们与我提出的观点是等效的：

1
2
3
4
5
6
7
8
9
10
11

def factorial(n, acc=1):
    """Return acc * n!; the recursive call is in tail position."""
    if n > 1:
        return factorial(n-1, acc*n)
    else:
        return acc

def factorial2(n):
    """Return n!; the multiplication runs after the recursive call returns.

    The function recurses into itself (not into the accumulator-based
    factorial), so it really is the non-tail-recursive variant.
    """
    if n > 1:
        return n*factorial2(n-1)
    else:
        return 1

注意，由编译器/解释器决定是否进行尾部递归。例如，如果我记得很好，python解释器就不会这样做(我在示例中使用python只是因为它的语法流畅)。不管怎样，如果你发现一些奇怪的东西，比如有两个参数的阶乘函数(其中一个参数的名字是acc、accumulator等等)，现在你知道人们为什么要这样做了。

相关讨论

有了Haskell，您真的不需要显式地考虑递归。

1
2
3
4
5
6
7
8
9
10
11
12
13

-- Number of positive divisors of 'number': fold over 1..floor(sqrt number),
-- adding 2 per divisor, then subtract 1 when 'number' is a perfect square
-- (its root was counted twice).
factorCount number = foldr factorCount' 0 [1..isquare] -
                     (fromEnum $ square == fromIntegral isquare)
    where
      square = sqrt $ fromIntegral number
      isquare = floor square
      -- Per-candidate step for the fold: add 2 on a hit, otherwise no change.
      factorCount' candidate
        | number `rem` candidate == 0 = (2 +)
        | otherwise = id

-- Triangle numbers as running sums of 1, 2, 3, ...
triangles :: [Int]
triangles = scanl1 (+) [1,2..]

main = print . head $ dropWhile ((< 1001) . factorCount) triangles

在上面的代码中，我用公共列表操作替换了@thomas'answer中的显式递归。代码仍然做同样的事情，而不必担心尾部递归。它的运行速度(~7.49s)比我的机器上的@thomas'answer(~7.04s)版本慢了大约6%，而@raedwulf的C版本运行速度~3.15s。似乎GHC在一年中有所改善。

我知道这是一个古老的问题，我在谷歌搜索中偶然发现了它(我忘记了我正在搜索的内容，现在…)。只是想对LCO的问题发表评论，表达我对哈斯克尔的总体感受。我想对最上面的答案发表评论，但是评论不允许代码块。

查看您的Erlang实现。时间安排包括启动整个虚拟机、运行程序和停止虚拟机。我很确定设置和停止Erlang虚拟机需要一些时间。

如果计时是在Erlang虚拟机本身内完成的，那么结果会有所不同，因为在这种情况下，我们将只对相关程序有实际时间。否则，我相信启动和加载Erlang虚拟机的过程所花费的总时间加上停止该虚拟机的过程所花费的总时间(如您在程序中所说)都包含在您用于为程序输出计时的方法所花费的总时间中。考虑使用Erlang计时本身，当我们想在虚拟机本身中对程序计时时，可以使用它。timer:tc/1 or timer:tc/2 or timer:tc/3。这样，Erlang的结果将排除启动和停止/终止/停止虚拟机所需的时间。这是我的推理，想一想，然后再试试你的基准。

实际上，我建议我们尝试在这些语言的运行时内为程序计时(对于具有运行时的语言)，以便获得精确的值。例如，C没有启动和关闭运行时系统的开销，而erlang、python和haskell则有(我对此有98%的把握——如有错误欢迎指正)。因此(基于这种推理)，我的结论是，对于运行在运行时系统之上的语言，这个基准不够精确/公平。让我们用这些更改再做一次。

编辑：此外，即使所有语言都有运行时系统，启动和停止每种语言的开销也会有所不同。所以我建议我们从运行时系统内开始计时(对于适用于此的语言)。众所周知，Erlang虚拟机在启动时有相当大的开销！

相关讨论

关于C版的更多数字和解释。显然，在这些年里没有人做过。记住要把这个答案加上投票，这样每个人都能看到并学习它。

第一步：作者计划的基准

笔记本电脑规格：

CPU I3 M380(931 MHz-最大节电模式)
4GB内存
Wi7 64位
Microsoft Visual Studio 2012旗舰版
Cygwin和GCC 4.9.3
Python 2.7.10

命令：

1
2
3

compiling on VS x64 command prompt > `for /f %f in ('dir /b *.c') do cl /O2 /Ot /Ox %f -o %f_x64_vs2012.exe`
compiling on cygwin with gcc x64 > `for f in ./*.c; do gcc -m64 -O3 $f -o ${f}_x64_gcc.exe ; done`
time (unix tools) using cygwin > `for f in ./*.exe; do echo "----------"; echo $f ; time $f ; done`

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24

----------
$ time python ./original.py

real 2m17.748s
user 2m15.783s
sys 0m0.093s
----------
$ time ./original_x86_vs2012.exe

real 0m8.377s
user 0m0.015s
sys 0m0.000s
----------
$ time ./original_x64_vs2012.exe

real 0m8.408s
user 0m0.000s
sys 0m0.015s
----------
$ time ./original_x64_gcc.exe

real 0m20.951s
user 0m20.732s
sys 0m0.030s

文件名为：integertype_architecture_compiler.exe。

integertype目前与原始程序相同(稍后将详细介绍)
体系结构为x86或x64，具体取决于编译器设置
编译器是gcc或vs2012

第二步：再次调查、改进和基准测试

Vs比GCC快250%。两个编译器的速度应该相似。显然，代码或编译器选项有问题。让我们调查一下！

第一个关注点是整数类型。转换成本很高，一致性对于更好的代码生成和优化很重要。所有整数应为同一类型。

这是一个混合混乱的int和long。我们会改进的。使用哪种类型？最快的。要给他们所有人做基准！

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108

----------
$ time ./int_x86_vs2012.exe

real 0m8.440s
user 0m0.016s
sys 0m0.015s
----------
$ time ./int_x64_vs2012.exe

real 0m8.408s
user 0m0.016s
sys 0m0.015s
----------
$ time ./int32_x86_vs2012.exe

real 0m8.408s
user 0m0.000s
sys 0m0.015s
----------
$ time ./int32_x64_vs2012.exe

real 0m8.362s
user 0m0.000s
sys 0m0.015s
----------
$ time ./int64_x86_vs2012.exe

real 0m18.112s
user 0m0.000s
sys 0m0.015s
----------
$ time ./int64_x64_vs2012.exe

real 0m18.611s
user 0m0.000s
sys 0m0.015s
----------
$ time ./long_x86_vs2012.exe

real 0m8.393s
user 0m0.015s
sys 0m0.000s
----------
$ time ./long_x64_vs2012.exe

real 0m8.440s
user 0m0.000s
sys 0m0.015s
----------
$ time ./uint32_x86_vs2012.exe

real 0m8.362s
user 0m0.000s
sys 0m0.015s
----------
$ time ./uint32_x64_vs2012.exe

real 0m8.393s
user 0m0.015s
sys 0m0.015s
----------
$ time ./uint64_x86_vs2012.exe

real 0m15.428s
user 0m0.000s
sys 0m0.015s
----------
$ time ./uint64_x64_vs2012.exe

real 0m15.725s
user 0m0.015s
sys 0m0.015s
----------
$ time ./int_x64_gcc.exe

real 0m8.531s
user 0m8.329s
sys 0m0.015s
----------
$ time ./int32_x64_gcc.exe

real 0m8.471s
user 0m8.345s
sys 0m0.000s
----------
$ time ./int64_x64_gcc.exe

real 0m20.264s
user 0m20.186s
sys 0m0.015s
----------
$ time ./long_x64_gcc.exe

real 0m20.935s
user 0m20.809s
sys 0m0.015s
----------
$ time ./uint32_x64_gcc.exe

real 0m8.393s
user 0m8.346s
sys 0m0.015s
----------
$ time ./uint64_x64_gcc.exe

real 0m16.973s
user 0m16.879s
sys 0m0.030s

整数类型为int、long、int32_t、uint32_t、int64_t和uint64_t，后四种来自#include <stdint.h>。

C中有许多整数类型，加上一些要使用的有符号/无符号类型，再加上要编译为x86或x64的选项(不要与实际整数大小混淆)。要编译和运行的版本很多^^

第三步：了解数字

决定性结论：

32位整数比64位等价物快约200%
无符号64位整数比有符号64位快25%(不幸的是，我对此没有解释)

技巧问题："C中int和long的大小是多少？"正确的答案是：int和long在c中的大小定义不明确！

根据C规范：

int is at least 32 bits
long is at least an int

从gcc手册页(-m32和-m64标志)：

The 32-bit environment sets int, long and pointer to 32 bits and generates code that runs on any i386 system.
The 64-bit environment sets int to 32 bits and long and pointer to 64 bits and generates code for AMD’s x86-64 architecture.

来自msdn文档(数据类型范围)https://msdn.microsoft.com/en-us/library/s3f49ktz%28v=vs.110%29.aspx:

int, 4 bytes, also knows as signed
long, 4 bytes, also knows as long int and signed long int

总结一下：经验教训

32位整数比64位整数快。

标准整数类型在C和C++中没有很好的定义，它们根据编译器和体系结构而变化。当需要一致性和可预测性时，请使用#include <stdint.h>中的uint32_t整数族。

速度问题解决了。所有其他语言都落后百分之几百，C和C++再次获胜！它们总是这样。下一个改进将是使用OpenMP进行多线程处理 :D

相关讨论

出于好奇，英特尔编译器做得怎么样？他们通常非常擅长优化数字代码。

你在哪里找到一个引用说C规范保证"int至少是32位"？我所知道的唯一保证是INT_MIN和INT_MAX(-32767和32767)的最小幅度，这实际上要求int至少为16位。要求long至少与int一样大，并且范围要求意味着long至少为32位。

你看起来是对的。stackoverflow.com/questions/1231147/is-int-in-c-always-32-bit

Question 1: Do Erlang, Python and Haskell lose speed due to using
arbitrary length integers or don't they as long as the values are less
than MAXINT?

问题一可以用二郎的否定回答。最后一个问题的答案是适当地使用erlang，如：
http://bredsaal.dk/learning-erlang-using-projecteuler-net
因为它比您最初的C示例要快，所以我猜会有许多问题，因为其他人已经详细讨论过了。
这个Erlang模块在5秒钟内在一个便宜的上网本上执行…它使用了Erlang中的网络线程模型，因此演示了如何利用事件模型。它可以分布在许多节点上。而且很快。不是我的代码。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
-module(p12dist).
-author("Jannich Brendle, [email protected], http://blog.bredsaal.dk").
-compile(export_all).

%% Start the work server with 1 as the first unassigned index.
server() ->
    server(1).

%% Work server loop. Number is the first index not yet handed out.
%%   {getwork, Pid} -> reply {work, Number, Number + 100}. solve/3 stops
%%                     *before* its End argument, so that chunk covers
%%                     Number .. Number + 99 and the next chunk has to start
%%                     at Number + 100 (starting at Number + 101 would leave
%%                     index Number + 100 untested).
%%   {result, T}    -> print T and return; the server process then ends
%%                     while the workers keep looping.
%%   anything else  -> ignored.
server(Number) ->
    receive
        {getwork, Worker_PID} ->
            Worker_PID ! {work, Number, Number + 100},
            server(Number + 100);
        {result, T} ->
            io:format("The result is: ~w.~n", [T]);
        _ ->
            server(Number)
    end.

%% Worker loop: ask the server for a chunk, search it, repeat.
worker(Server_PID) ->
    Server_PID ! {getwork, self()},
    receive
        {work, Start, End} -> solve(Start, End, Server_PID)
    end,
    worker(Server_PID).

%% Spawn the server and four workers.
start() ->
    Server_PID = spawn(p12dist, server, []),
    spawn(p12dist, worker, [Server_PID]),
    spawn(p12dist, worker, [Server_PID]),
    spawn(p12dist, worker, [Server_PID]),
    spawn(p12dist, worker, [Server_PID]).

%% Test the triangle numbers with indices N .. End - 1 (End is exclusive).
%% Sends {result, T} to the server for the first one with more than 500
%% divisors; returns no_solution when the chunk is exhausted.
solve(N, End, _) when N =:= End -> no_solution;

solve(N, End, Server_PID) ->
    T = round(N * (N + 1) / 2),
    case (divisor(T, round(math:sqrt(T))) > 500) of
        true ->
            Server_PID ! {result, T};
        false ->
            solve(N + 1, End, Server_PID)
    end.

%% Divisor count of N as computed by divisor/2 from the rounded root.
divisors(N) ->
    divisor(N, round(math:sqrt(N))).

%% Counts down I .. 1, adding 2 for every I that divides N, on top of a
%% base value of 1 returned when I reaches 0.
divisor(_, 0) -> 1;
divisor(N, I) ->
    case (N rem I) =:= 0 of
        true ->
            2 + divisor(N, I - 1);
        false ->
            divisor(N, I - 1)
    end.

下面的测试在：Intel(R)Atom(TM)CPU [email protected]上进行。

1
2
3
4
5
6
7
8
9
10
11
12
13
~$ time erl -noshell -s p12dist start

The result is: 76576500.

^C

BREAK: (a)bort (c)ontinue (p)roc info (i)nfo (l)oaded
(v)ersion (k)ill (D)b-tables (d)istribution
a

real 0m5.510s
user 0m5.836s
sys 0m0.152s

相关讨论

按如下方式将值增加到1000并不能获得正确的结果。最新测试：IntelCore2 CPU [email protected]，实际0平方米370秒。

你的结果：76576500其他人：842161320你的结果有点错误

因为我是董一些其他欧拉问题，我只是检查了我的结果。projecteuler.net/problem=12的答案是76576500，这是毫无疑问的。我知道这看起来很奇怪，但我刚检查过。

相比之下，我得到9.03与原始的C版本，而使用Erlang19与Mark的代码，我得到5.406，167.0366%更快。

尝试去：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
package main

import"fmt"
import"math"

// main prints the first triangle number with more than 1000 divisors.
//
// For each triangle number n = i*(i+1)/2 it trial-divides by 1..floor(sqrt(n)).
// Every divisor f found there stands for the pair (f, n/f) and adds two; when
// n is a perfect square the root pairs with itself, so one is taken off again.
func main() {
	for i := 1; ; i++ {
		n := i * (i + 1) / 2
		// m must be derived from the n just computed. In a tuple assignment
		// all right-hand sides are evaluated before any variable is
		// assigned, so computing both in one statement would use the
		// previous n.
		m := int(math.Sqrt(float64(n)))
		c := 0
		for f := 1; f <= m; f++ {
			if n%f == 0 {
				c += 2
			}
		}
		if m*m == n {
			c--
		}
		if c > 1000 {
			fmt.Println(n)
			break
		}
	}
}

我得到：
原C版：9.1690 100%去：8.2520 111%
但使用：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
package main

import (
"math"
"fmt"
)

// PrimesBelow returns, in ascending order, the primes found by an odd-only
// Sieve of Eratosthenes. Index i of the sieve stands for the odd number
// 2*i+1, with indices running up to (limit-1)/2, so an odd prime limit is
// itself part of the result (just as limit == 2 yields [2]). For limit < 2
// the result is empty.
func PrimesBelow(limit int) []int {
	switch {
	case limit < 2:
		return []int{}
	case limit == 2:
		return []int{2}
	}
	sievebound := (limit - 1) / 2
	sieve := make([]bool, sievebound+1) // true = composite
	crosslimit := int(math.Sqrt(float64(limit))-1) / 2
	for i := 1; i <= crosslimit; i++ {
		if !sieve[i] {
			// Start at the index of (2i+1)^2 and step by the prime itself.
			for j := 2 * i * (i + 1); j <= sievebound; j += 2*i + 1 {
				sieve[j] = true
			}
		}
	}
	// plimit is only a capacity estimate now. Collecting with append
	// returns exactly the primes found, whereas trimming a fixed-size
	// slice by its trailing zeros lost the last prime whenever the slice
	// happened to be completely full.
	plimit := int(1.3*float64(limit)) / int(math.Log(float64(limit)))
	primes := make([]int, 0, plimit)
	primes = append(primes, 2)
	for i := 1; i <= sievebound; i++ {
		if !sieve[i] {
			primes = append(primes, 2*i+1)
		}
	}
	return primes
}

// main prints the value computed by p12.
func main() {
	answer := p12()
	fmt.Println(answer)
}
// Requires PrimesBelow from utils.go
//
// p12 returns the first triangle number with more than 1000 divisors.
//
// The triangle number (n-1)*n/2 is split into the parts n-1 and n, with the
// factor 2 divided out of the even one. dn is the divisor count of the part
// from the previous step, dn1 the one computed for the current n, and their
// product cnt is the divisor count of the triangle number.
func p12() int {
	n, dn, cnt := 3, 2, 0
	primearray := PrimesBelow(1000000)
	// "More than 1000" means stopping as soon as cnt reaches 1001;
	// looping while cnt <= 1001 would skip a count of exactly 1001.
	for cnt <= 1000 {
		n++
		n1 := n
		if n1%2 == 0 {
			n1 /= 2
		}
		dn1 := 1
		for i := 0; i < len(primearray); i++ {
			// No prime up to sqrt(n1) is left: the remaining n1 is a
			// prime with exponent 1.
			if primearray[i]*primearray[i] > n1 {
				dn1 *= 2
				break
			}
			// exponent ends up as (multiplicity of this prime) + 1.
			exponent := 1
			for n1%primearray[i] == 0 {
				exponent++
				n1 /= primearray[i]
			}
			if exponent > 1 {
				dn1 *= exponent
			}
			if n1 == 1 {
				break
			}
		}
		cnt = dn * dn1
		dn = dn1
	}
	return n * (n - 1) / 2
}

我得到：
原C版：9.1690 100%Thaumkid的C版本：0.1060 8650%第一次通过版本：8.2520 111%第二版：0.0230 39865%
我还尝试了python3.6和pypy3.3-5.5-alpha：
原C版：8.629 100%Thaumkid的C版：0.109 7916%Python3.6:54.795 16%PYPY3.3-5.5-α：13.291 65%
然后用下面的代码我得到：
原C版：8.629 100%Thaumkid的C版：0.109 8650%Python3.6:1.489 580%PYPY3.3-5.5-α：0.582 1483%

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def D(N):
    """Return the number of positive divisors of N.

    nf starts at 1 for the pair (1, N) and grows by one for every divisor d
    in 2..floor(sqrt(N)); each such d stands for the pair (d, N / d). The
    doubled total is reduced by one when N is a perfect square, because the
    root pairs with itself.
    """
    if N == 1: return 1
    sqrtN = int(N ** 0.5)
    nf = 1
    for d in range(2, sqrtN + 1):
        if N % d == 0:
            nf = nf + 1
    return 2 * nf - (1 if sqrtN**2 == N else 0)

L = 1000
Dt, n = 0, 0

# t is the n-th triangle number n*(n+1)/2. Its divisor count Dt is the
# product of the counts of n and n+1 after halving whichever one is even.
# Floor division keeps the halves as ints; "/" would hand floats to D()
# under Python 3.
while Dt <= L:
    t = n * (n + 1) // 2
    Dt = D(n//2)*D(n+1) if n%2 == 0 else D(n)*D((n+1)//2)
    n = n + 1

print (t)

C++ 11，<20Ms，运行在这里
我知道你想要一些提示来帮助提高你的语言特定知识，但是由于这里已经很好地介绍了这一点，我想我应该为那些可能看过关于你问题的Mathematica评论等的人添加一些上下文，并且想知道为什么这个代码慢得多。
这个答案主要是提供上下文，希望帮助人们更容易地评估您的问题/其他答案中的代码。
此代码仅使用两个(丑陋的)优化，与使用的语言无关，基于：

每个三角形编号的形式为n(n+1)/2。

n和n+1是互质的

除数是一个乘法函数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#include <iostream>
#include <cmath>
#include <tuple>
#include <chrono>

using namespace std;

// Returns the number of divisors of n, derived from its prime
// factorisation: the product of (exponent + 1) over all prime factors.
int get_divisors(long long n)
{
    int total = 1;
    long long p = 2;

    // Trial divisors 2, 3, 5, 7, 9, ... while p does not exceed the square
    // root of what is left of n.
    while (p <= std::sqrt(n))
    {
        int exponent = 0;
        for (; n % p == 0; n /= p)
            ++exponent;

        total *= exponent + 1;

        // After 2 only odd candidates are tried.
        p += (p == 2) ? 1 : 2;
    }

    // A remainder other than 1 is one more prime factor with exponent 1.
    return (n != 1) ? total * 2 : total;
}

long long euler12()
{
//n and n + 1
long long n, n_p_1;

n = 1; n_p_1 = 2;

// divisors_x will store either the divisors of x or x/2
// (the later iff x is divisible by two)
long long divisors_n = 1;
long long divisors_n_p_1 = 2;

for(;;)
{
/* This loop has been unwound, so two iterations are completed at a time
* n and n + 1 have no prime factors in common and therefore we can
* calculate their divisors separately
*/

long long total_divisors; //the divisors of the triangle number
// n(n+1)/2

//the first (unwound) iteration

divisors_n_p_1 = get_divisors(n_p_1 / 2); //here n+1 is even and we

total_divisors =
divisors_n
* divisors_n_p_1;

if(total_divisors > 1000)
break;

//move n and n+1 forward
n = n_p_1;
n_p_1 = n + 1;

//fix the divisors
divisors_n = divisors_n_p_1;
divisors_n_p_1 = get_divisors(n_p_1); //n_p_1 is now odd!

//now the second (unwound) iteration

total_divisors =
divisors_n
* divisors_n_p_1;

if(total_divisors > 1000)
break;

//move n and n+1 forward
n = n_p_1;
n_p_1 = n + 1;

//fix the divisors
divisors_n = divisors_n_p_1;
divisors_n_p_1 = get_divisors(n_p_1 / 2); //n_p_1 is now even!
}

return (n * n_p_1) / 2;
}

// Runs euler12() 1000 times and prints, for each run, the result followed by
// the elapsed time in whole milliseconds.
int main()
{
    using namespace std::chrono;

    for(int i = 0; i < 1000; i++)
    {
        auto start = high_resolution_clock::now();
        auto result = euler12();
        auto end = high_resolution_clock::now();

        // duration_cast<milliseconds> truncates to an integer tick count.
        double time_elapsed = duration_cast<milliseconds>(end - start).count();

        // A space separates the two values so they do not run together.
        std::cout << result << " " << time_elapsed << '\n';
    }
    return 0;
}

我的台式电脑平均需要19毫秒左右，而笔记本电脑则需要80毫秒左右，这与我在这里看到的其他大多数代码相差甚远。毫无疑问，还有很多优化方案可供选择。

相关讨论

这是相当明确的不是询问者所要求的，"我真的试图在四种语言中实现尽可能相似的算法"。引用与你相似的许多已删除答案中的一个的评论，"很明显，无论语言如何，你都可以用更好的算法获得更快的速度。"

@托马斯·杜布森。这就是我要说的。这个问题答案严重地暗示了算法的加速是重要的(当然OP没有要求它们)，但是没有明确的例子。我认为这个答案——并不是经过严格优化的代码——为像我这样想知道操作代码有多慢/多快的人提供了一个有用的上下文。

+1.使用你的大脑而不是低水平的优化

GCC甚至可以预先计算许多这类模式。例如 int a = 0; for (int i = 0; i < 10000000; ++i) a += i; 会在编译时就被算出结果，因此运行时耗时不到1毫秒。这一点同样重要。

@托马斯：我必须同意用户3125280的观点——应该比较语言在做一些聪明的事情时的表现，而不是在做一些愚蠢的事情时如何击败真正的编程语言。智能算法通常关心的不是微观效率，而是灵活性、连接(组合)东西和基础设施的能力。无论是20毫秒还是50毫秒，关键不在于它是8秒还是8分钟。

变更：case (divisor(T,round(math:sqrt(T))) > 500) of。
致：case (divisor(T,round(math:sqrt(T))) > 1000) of。
这将为Erlang多进程示例生成正确的答案。

相关讨论

这是作为对这个答案的评论吗？因为还不清楚，这本身就不是答案。

我假设只有当一个数含有许多小的素因子时，它的因子个数才会很大。所以我使用了Thaumkid的优秀算法，但先用一个永远不会偏小的近似因子计数做筛选。方法很简单：先除去不超过29的素因子，再根据剩余的数计算因子个数的上限。用这个函数得到因子个数的上限，只有当该上限足够高时，才去计算确切的因子个数。
下面的代码的正确性并不依赖这个假设，这个假设只影响速度。这种做法看来是有效的：只有大约十分之一的数给出了足够高的估计值，需要做完整的检查。
代码如下：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
// CHECK(d): if d divides n, divide every factor d out of n and, for each one
// removed, add to `count` the value `count` had before d was tried. A prime d
// with exponent e therefore multiplies `count` by (e + 1).
// Expects locals `n`, `count` and `add` in the enclosing scope.
#define CHECK(d) \
do { \
    if (n % (d) == 0) { \
        add = count; \
        do { n /= (d); count += add; } \
        while (n % (d) == 0); \
    } \
} while (0)

// Return at least the number of factors of n (an upper bound, never too
// small). Requires n >= 1: for n == 0 the CHECK loop would never terminate.
//
// The prime factors up to 29 are divided out and counted exactly. What is
// left has only prime factors >= 31, and its factor count is bounded by a
// power of two chosen from its size.
static uint64_t approxfactorcount (uint64_t n)
{
    uint64_t count = 1, add;

    CHECK ( 2); CHECK ( 3); CHECK ( 5); CHECK ( 7); CHECK (11); CHECK (13);
    CHECK (17); CHECK (19); CHECK (23); CHECK (29);

    if (n == 1) return count;
    if (n < 1ull * 31 * 31) return count * 2;
    if (n < 1ull * 31 * 31 * 37) return count * 4;
    if (n < 1ull * 31 * 31 * 37 * 37) return count * 8;
    if (n < 1ull * 31 * 31 * 37 * 37 * 41) return count * 16;
    if (n < 1ull * 31 * 31 * 37 * 37 * 41 * 43) return count * 32;
    if (n < 1ull * 31 * 31 * 37 * 37 * 41 * 43 * 47) return count * 64;
    if (n < 1ull * 31 * 31 * 37 * 37 * 41 * 43 * 47 * 53) return count * 128;
    if (n < 1ull * 31 * 31 * 37 * 37 * 41 * 43 * 47 * 53 * 59) return count * 256;
    if (n < 1ull * 31 * 31 * 37 * 37 * 41 * 43 * 47 * 53 * 59 * 61) return count * 512;
    if (n < 1ull * 31 * 31 * 37 * 37 * 41 * 43 * 47 * 53 * 59 * 61 * 67) return count * 1024;

    // The next threshold (previous product * 71, about 9.9e19) does not fit
    // in 64 bits, so every remaining uint64_t value lies below it. Writing
    // it as a constant would silently wrap around, so return its bound
    // directly.
    return count * 2048;
}

// Return the number of factors of n, computed exactly by trial division.
// Requires n >= 1: for n == 0 the division loops would never terminate.
//
// For n = p1^e1 * ... * pk^ek the result is (e1 + 1) * ... * (ek + 1).
static uint64_t factorcount (uint64_t n)
{
    uint64_t count = 1;
    uint64_t exponent;

    // Strip 2 and 3 first so the main loop can skip all their multiples.
    for (exponent = 0; n % 2 == 0; ++exponent) n /= 2;
    count *= exponent + 1;
    for (exponent = 0; n % 3 == 0; ++exponent) n /= 3;
    count *= exponent + 1;

    // Candidates 5, 7, 11, 13, ...: the step alternates between 2 and 4.
    // `d <= n / d` means d*d <= n but cannot wrap around for n near 2^64.
    uint64_t d = 5, inc = 2;
    for (; d <= n / d; d += inc, inc = (6 - inc))
    {
        for (exponent = 0; n % d == 0; ++exponent) n /= d;
        count *= exponent + 1;
    }

    if (n > 1) count *= 2; // n must be a prime number
    return count;
}

// Prints triangular numbers with record numbers of factors, for indices
// 1..limit. Only records above the starting value of 30000 are reported.
//
// The n-th triangular number is factor1 * factor2, where factor1 comes from
// n and factor2 from n + 1, the even one of the two being halved. The cheap
// upper bound from approxfactorcount filters candidates; the exact count is
// computed only when that bound beats the current record.
static void printrecordnumbers (uint64_t limit)
{
    uint64_t record = 30000;

    uint64_t count1, factor1;
    uint64_t count2 = 1, factor2 = 1;

    for (uint64_t n = 1; n <= limit; ++n)
    {
        // The previous step's second factor becomes this step's first factor.
        factor1 = factor2;
        count1 = count2;

        factor2 = n + 1; if (factor2 % 2 == 0) factor2 /= 2;
        count2 = approxfactorcount (factor2);

        if (count1 * count2 > record)
        {
            uint64_t factors = factorcount (factor1) * factorcount (factor2);
            if (factors > record)
            {
                // The casts make the arguments match %llu on every platform.
                // factor1 * factor2 is a 64-bit product and wraps once the
                // triangular number itself no longer fits in 64 bits.
                printf ("%lluth triangular number = %llu has %llu factors\n",
                        (unsigned long long) n,
                        (unsigned long long) (factor1 * factor2),
                        (unsigned long long) factors);
                record = factors;
            }
        }
    }
}

该程序在约0.7秒内找到第147530244个三角形数（有13824个因子），在34秒内找到第879207615个三角形数（有61440个因子），在10分5秒内找到第12524486975个三角形数（有138240个因子），在21分25秒内找到第26467792064个三角形数（有172032个因子）（2.4GHz Core2 Duo），所以这段代码平均每个数字只需要116个处理器周期。最后一个三角形数本身大于2^68，所以……

我把"Jannich Brendle"版本改为1000而不是500。并列出euler12.bin、euler12.erl、p12dist.erl的结果。两个erl代码都使用"+native"进行编译。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
zhengs-MacBook-Pro:workspace zhengzhibin$ time erl -noshell -s p12dist start
The result is: 842161320.

real 0m3.879s
user 0m14.553s
sys 0m0.314s
zhengs-MacBook-Pro:workspace zhengzhibin$ time erl -noshell -s euler12 solve
842161320

real 0m10.125s
user 0m10.078s
sys 0m0.046s
zhengs-MacBook-Pro:workspace zhengzhibin$ time ./euler12.bin
842161320

real 0m5.370s
user 0m5.328s
sys 0m0.004s
zhengs-MacBook-Pro:workspace zhengzhibin$

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#include <stdio.h>
#include <math.h>

/* Return the number of positive divisors of n, computed from its prime
 * factorisation: each prime with exponent e contributes a factor (e + 1).
 * Returns 1 for n <= 1.
 */
int factorCount (long n)
{
    int count = 1;
    long candidate;

    /* Trial division up to the square root of the remaining n.
     * `candidate <= n / candidate` is candidate^2 <= n without overflow. */
    for (candidate = 2; candidate <= n / candidate; candidate++) {
        int c = 1;
        while (n % candidate == 0) {
            c++;
            n /= candidate;
        }
        count *= c;
    }

    /* Whatever is left above 1 is a single prime factor larger than the
     * square root. It has to be counted too, otherwise e.g. 14 = 2 * 7
     * would yield 2 instead of 4. */
    if (n > 1)
        count *= 2;

    return count;
}

/* Walk the triangular numbers 1, 3, 6, 10, ... and print the first one that
 * has more than 1000 divisors. */
int main ()
{
    long triangle = 1;
    int index = 1;
    while (factorCount (triangle) < 1001)
    {
        index ++;
        triangle += index; /* next triangular number */
    }
    printf ("%ld\n", triangle);
    return 0;
}

gcc -lm -Ofast euler.c
time ./a.out
2.79s user 0.00s system 99% cpu 2.794 total