I am trying to find the frequency of each symbol in any given text using an O(n) algorithm. My algorithm looks like this:
s = len(text)
P = 1.0 / s
freqs = {}
for char in text:
    try:
        freqs[char] += P
    except KeyError:
        freqs[char] = P
but I suspect that this dictionary-based method is not fast enough, because it depends on the underlying implementation of the dictionary methods. Is this the fastest method?
UPDATE: using collections and integer counts gives no increase in speed. This is because the algorithm already has O(n) complexity, so no essential speedup is possible.
For example, the results for a 1 MB text:
without collections:
real 0m0.695s
with collections:
real 0m0.625s
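A sketch of the collections-based variant timed above (assuming integer counts instead of adding P on each step):

import collections

freqs = collections.defaultdict(int)  # missing keys start at 0
for char in text:
    freqs[char] += 1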
Performance comparison
Note: time in the table doesn't include the time it takes to load files.
| approach | american-english, | big.txt, | time w.r.t. defaultdict |
| | time, seconds | time, seconds | |
|----------------+-------------------+---------------+-------------------------|
| Counter | 0.451 | 3.367 | 3.6 |
| setdefault | 0.348 | 2.320 | 2.5 |
| list | 0.277 | 1.822 | 2 |
| try/except | 0.158 | 1.068 | 1.2 |
| defaultdict | 0.141 | 0.925 | 1 |
| numpy | 0.012 | 0.076 | 0.082 |
| S.Mark's ext. | 0.003 | 0.019 | 0.021 |
| ext. in Cython | 0.001 | 0.008 | 0.0086 |
#+TBLFM: $4=$3/@7$3;%.2g
The files used: '/usr/share/dict/american-english' and 'big.txt'.
The script that compares the 'Counter', 'setdefault', 'list', 'try/except', 'defaultdict', 'numpy', and Cython-based solutions, as well as @S.Mark's solution, is at http://gist.github.com/347000
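For context, the 'setdefault' and 'list' approaches from the table look roughly like this (my own sketches; the exact code is in the gist above):

# 'setdefault': supply a default of 0 the first time a key is seen
freqs = {}
for char in text:
    freqs[char] = freqs.setdefault(char, 0) + 1

# 'list': index a flat table by code point (BMP-only, like the Cython
# version below)
counts = [0] * 0x10000
for char in text:
    counts[ord(char)] += 1
freqs = dict((unichr(i), c) for i, c in enumerate(counts) if c)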
The fastest solution is a Python extension written in Cython:
import cython

@cython.locals(
    chars=unicode,
    i=cython.Py_ssize_t,
    L=cython.Py_ssize_t[0x10000])
def countchars_cython(chars):
    for i in range(0x10000): # unicode code points > 0xffff are not supported
        L[i] = 0
    for c in chars:
        L[c] += 1
    return {unichr(i): L[i] for i in range(0x10000) if L[i]}
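To run it, the extension has to be compiled first. A minimal sketch, assuming the code above is saved as countchars.pyx (a hypothetical name) and Cython is installed, uses pyximport:

# Python 2 sketch; countchars.pyx is a hypothetical module name.
import pyximport; pyximport.install()  # compiles .pyx modules on first import
from countchars import countchars_cython

text = open('/usr/share/dict/american-english').read().decode('utf-8')
print countchars_cython(text)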
Previous comparison:
* python (dict) : 0.5 seconds
* python (list) : 0.5 (ascii) (0.2 if the whole file is read into memory first)
* perl : 0.5
* python (numpy): 0.07
* c++ : 0.05
* c : 0.008 (ascii)
Input data:
$ tail /usr/share/dict/american-english
éclat's
élan
élan's
émigré
émigrés
épée
épées
étude
étude's
études
$ du -h /usr/share/dict/american-english
912K /usr/share/dict/american-english
python (Counter): 0.5 seconds
#!/usr/bin/env python3.1
import collections, fileinput, textwrap
chars = (ch for word in fileinput.input() for ch in word.rstrip())
# faster (0.4s) but less flexible: chars = open(filename).read()
print(textwrap.fill(str(collections.Counter(chars)), width=79))
Run it:
$ time -p python3.1 count_char.py /usr/share/dict/american-english
Counter({'e': 87823, 's': 86620, 'i': 66548, 'a': 62778, 'n': 56696, 'r': 56286, 't': 51588, 'o': 48425, 'l': 39914, 'c': 30020, 'd': 28068, 'u': 25810, "'": 24511, 'g': 22262, 'p': 20917, 'm': 20747, 'h': 18453, 'b': 14137, 'y': 12367, 'f': 10049, 'k': 7800, 'v': 7573, 'w': 6924, 'z': 3088, 'x': 2082, 'M': 1686, 'C': 1549, 'S': 1515, 'q': 1447, 'B': 1387, 'j': 1376, 'A': 1345, 'P': 974, 'L': 912, 'H': 860, 'T': 858, 'G': 811, 'D': 809, 'R': 749, 'K': 656, 'E': 618, 'J': 539, 'N': 531, 'W': 507, 'F': 502, 'O': 354, 'I': 344, 'V': 330, 'Z': 150, 'Y': 140, 'é': 128, 'U': 117, 'Q': 63, 'X': 42, 'è': 29, 'ö': 12, 'ü': 12, 'ó': 10, 'á': 10, 'ä': 7, 'ê': 6, 'â': 6, 'ñ': 6, 'ç': 4, 'å': 3, 'û': 3, 'í': 2, 'ô': 2, 'Å': 1})
real 0.44
user 0.43
sys 0.01
perl: 0.5 seconds
time -p perl -MData::Dumper -F'' -lanwe'$c{$_}++ for (@F);
END{ $Data::Dumper::Terse = 1; $Data::Dumper::Indent = 0; print Dumper(\%c) }
' /usr/share/dict/american-english
Output (note: without the -C switch Perl processes bytes, so each byte of a multi-byte UTF-8 character is tallied as a separate, mostly unprintable key):
{'S' => 1515,'K' => 656,'' => 29,'d' => 28068,'Y' => 140,'E' => 618,'y' => 12367,'g' => 22262,'e' => 87823,'' => 2,'J' => 539,'' => 241,'' => 3,'' => 6,'' => 4,'' => 128,'D' => 809,'q' => 1447,'b' => 14137,'z' => 3088,'w' => 6924,'Q' => 63,'' => 10,'M' => 1686,'C' => 1549,'' => 10,'L' => 912,'X' => 42,'P' => 974,'' => 12,'\'' => 24511,'' => 6,'a' => 62778,'T' => 858,'N' => 531,'j' => 1376,'Z' => 150,'u' => 25810,'k' => 7800,'t' => 51588,'' => 6,'W' => 507,'v' => 7573,'s' => 86620,'B' => 1387,'H' => 860,'c' => 30020,'' => 12,'I' => 344,'' => 3,'G' => 811,'U' => 117,'F' => 502,'' => 2,'r' => 56286,'x' => 2082,'V' => 330,'h' => 18453,'f' => 10049,'' => 1,'i' => 66548,'A' => 1345,'O' => 354,'n' => 56696,'m' => 20747,'l' => 39914,'' => 7,'p' => 20917,'R' => 749,'o' => 48425}
real 0.51
user 0.49
sys 0.02
python (numpy): 0.07 seconds
Based on Ants Aasma's answer (modified to support unicode):
#!/usr/bin/env python
import codecs, itertools, operator, sys
import numpy
filename = sys.argv[1] if len(sys.argv)>1 else '/usr/share/dict/american-english'
# ucs2 or ucs4 python?
dtype = {2: numpy.uint16, 4: numpy.uint32}[len(buffer(u"u"))]
# count ordinals
text = codecs.open(filename, encoding='utf-8').read()
a = numpy.frombuffer(text, dtype=dtype)
counts = numpy.bincount(a)
# pretty print
counts = [(unichr(i), v) for i, v in enumerate(counts) if v]
counts.sort(key=operator.itemgetter(1))
print ' '.join('("%s" %d)' % c for c in counts if c[0] not in ' \t\n')
Output:
("Å" 1) ("í" 2) ("ô" 2) ("å" 3) ("û" 3) ("ç" 4) ("â" 6) ("ê" 6) ("ñ" 6) ("ä" 7) ("á" 10) ("ó" 10) ("ö" 12) ("ü" 12) ("è" 29) ("X" 42) ("Q" 63) ("U" 117) ("é" 128) ("Y" 140) ("Z" 150) ("V" 330) ("I" 344) ("O" 354) ("F" 502) ("W" 507) ("N" 531) ("J" 539) ("E" 618) ("K" 656) ("R" 749) ("D" 809) ("G" 811) ("T" 858) ("H" 860) ("L" 912) ("P" 974) ("A" 1345) ("j" 1376) ("B" 1387) ("q" 1447) ("S" 1515) ("C" 1549) ("M" 1686) ("x" 2082) ("z" 3088) ("w" 6924) ("v" 7573) ("k" 7800) ("f" 10049) ("y" 12367) ("b" 14137) ("h" 18453) ("m" 20747) ("p" 20917) ("g" 22262) ("'" 24511) ("u" 25810) ("d" 28068) ("c" 30020) ("l" 39914) ("o" 48425) ("t" 51588) ("r" 56286) ("n" 56696) ("a" 62778) ("i" 66548) ("s" 86620) ("e" 87823)
real 0.07
user 0.06
sys 0.01
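For reference, a Python 3 equivalent of the counting core might look like this (an assumption on my part: buffer() and unichr() are gone in Python 3, so the code-point array is built via a UTF-32 encoding instead):

#!/usr/bin/env python3
import sys
import numpy as np

filename = sys.argv[1] if len(sys.argv) > 1 else '/usr/share/dict/american-english'
text = open(filename, encoding='utf-8').read()
# utf-32-le yields one fixed-width 4-byte unit per code point (and no BOM)
a = np.frombuffer(text.encode('utf-32-le'), dtype=np.uint32)
counts = np.bincount(a)
pairs = sorted(((chr(i), v) for i, v in enumerate(counts) if v), key=lambda c: c[1])
print(' '.join('("%s" %d)' % c for c in pairs if c[0] not in ' \t\n'))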
c++: 0.05 seconds
// $ g++ *.cc -lboost_program_options
// $ ./a.out /usr/share/dict/american-english
#include <iostream>
#include <fstream>
#include <cstdlib> // exit
#include <boost/program_options/detail/utf8_codecvt_facet.hpp>
#include <boost/tr1/unordered_map.hpp>
#include <boost/foreach.hpp>

int main(int argc, char* argv[]) {
  using namespace std;

  // open the input file
  if (argc != 2) {
    cerr << "Usage: " << argv[0] << " <filename>\n";
    exit(2);
  }
  wifstream f(argv[argc-1]);

  // assume the file has utf-8 encoding
  locale utf8_locale(locale(""),
                     new boost::program_options::detail::utf8_codecvt_facet);
  f.imbue(utf8_locale);

  // count character frequencies
  typedef std::tr1::unordered_map<wchar_t, size_t> hashtable_t;
  hashtable_t counts;
  for (wchar_t ch; f >> ch; )
    counts[ch]++;

  // print the result
  wofstream of("output.utf8");
  of.imbue(utf8_locale);
  BOOST_FOREACH(hashtable_t::value_type i, counts)
    of << "(" << i.first << " " << i.second << ") ";
  of << endl;
}
Result:
$ cat output.utf8
(í 2) (O 354) (P 974) (Q 63) (R 749) (S 1,515) (ñ 6) (T 858) (U 117) (ó 10) (ô 2) (V 330) (W 507) (X 42) (ö 12) (Y 140) (Z 150) (û 3) (ü 12) (a 62,778) (b 14,137) (c 30,020) (d 28,068) (e 87,823) (f 10,049) (g 22,262) (h 18,453) (i 66,548) (j 1,376) (k 7,800) (l 39,914) (m 20,747) (n 56,696) (o 48,425) (p 20,917) (q 1,447) (r 56,286) (s 86,620) (t 51,588) (u 25,810) (Å 1) (' 24,511) (v 7,573) (w 6,924) (x 2,082) (y 12,367) (z 3,088) (A 1,345) (B 1,387) (C 1,549) (á 10) (â 6) (D 809) (E 618) (F 502) (ä 7) (å 3) (G 811) (H 860) (ç 4) (I 344) (J 539) (è 29) (K 656) (é 128) (ê 6) (L 912) (M 1,686) (N 531)
c (ascii): 0.0079 seconds
// $ gcc -O3 cc_ascii.c -o cc_ascii && time -p ./cc_ascii < input.txt
#include <stdio.h>

enum { N = 256 };
size_t counts[N];

int main(void) {
  // count characters
  int ch = -1;
  while ((ch = getchar()) != EOF)
    ++counts[ch];

  // print the result
  size_t i = 0;
  for (; i < N; ++i)
    if (counts[i])
      printf("('%c' %zu) ", (int)i, counts[i]);
  return 0;
}