- [纯python和pandas速度比较]
- [一百万数据做测试, 六个统计量]
- [或许多计算几个不是放到list里再计算一遍会更快点? 算了, 反正纯python太慢都用pandas了, 不再测试了]
- [那么多几个统计量呢? 比如20个? 30个? 会有追平的情况么?]
- [是只有基本类型会这样还是所有的统计都是pandas快?]
- [尝试使用纯python多次循环, 单次循环与pandas对比, go 单次与多次循环对比]
- [一百万数据做测试, 六个统计量]
纯python和pandas速度比较
有个数据分析的任务, 本来打算用pandas做, 后来想的是要遍历一个大列表(十万左右, 4000组)几十遍
使用pandas会不会因为一直在遍历导致速度比较慢
尝试使用纯python做, 争取一次遍历把所有的统计项计算出来
所以先试试看速度会有多少差距
先说结论: 不要用纯python来写数据分析, 使用pandas会快得多
一百万数据做测试, 六个统计量
python:
from pandas import DataFrame
import time
import random
def create_test_data(nums=1000*1000):
    """Generate the benchmark input file.

    Writes ``nums`` random integers, each drawn from 1..1000 inclusive,
    to ``data.txt`` in the current working directory, one value per line.
    An existing file is overwritten.
    """
    with open("data.txt", "w") as out:
        # One randint() call per line, in order, streamed straight to the file.
        out.writelines(f"{random.randint(1, 1000)}\n" for _ in range(nums))
if __name__ == "__main__":
    # Benchmark script: derive six filtered subsets from the integers stored
    # in ./data.txt, once with a single pure-Python loop and once with pandas
    # boolean indexing, and print the elapsed seconds of each approach.
    with open("./data.txt") as f:
        data = f.readlines()

    # Build both inputs before any timer starts so that parsing the text
    # into integers is not part of either measurement.
    call_times_pd = DataFrame({"call_times": data})
    call_times_pd[["call_times"]] = call_times_pd[["call_times"]].astype(int)
    call_times_py = [int(i) for i in data]

    # --- pure Python: one traversal filling six lists ---
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring short durations; time.time() can jump when the system clock
    # is adjusted.
    st = time.perf_counter()
    lte_200_py = []
    lt_500_py = []
    eq_600_py = []
    xx = []
    yy = []
    zz = []
    for i in call_times_py:
        if i <= 200:
            lte_200_py.append(i)
        if i < 500:
            lt_500_py.append(i)
        if i == 600:
            eq_600_py.append(i)
        if i <= 100:
            xx.append(i)
        if i <= 300:
            yy.append(i)
        if i > 800:
            zz.append(i)
    # Stop the clock before printing so console I/O is not measured.
    py_elapsed = time.perf_counter() - st
    print(len(lte_200_py), len(lt_500_py), len(eq_600_py))
    print(py_elapsed)

    # --- pandas: six boolean-mask selections ---
    stt = time.perf_counter()
    lte_200_pd = call_times_pd[call_times_pd.call_times <= 200]
    lt_500_pd = call_times_pd[call_times_pd.call_times < 500]
    eq_600_pd = call_times_pd[call_times_pd.call_times == 600]
    # The next three results are never read; they exist only so pandas does
    # the same amount of filtering work as the Python loop above.
    xxx = call_times_pd[call_times_pd.call_times <= 100]
    yyy = call_times_pd[call_times_pd.call_times <= 300]
    zzz = call_times_pd[call_times_pd.call_times > 800]
    pd_elapsed = time.perf_counter() - stt
    print(len(lte_200_pd), len(lt_500_pd), len(eq_600_pd))
    print(pd_elapsed)
最后测试发现不使用pandas会比使用pandas慢一个数量级
为什么用list而不是直接用int, 因为需求中会要求获取次数, 平均值
例如, 小于200的总次数, 平均值, 中位数等等
或许多计算几个不是放到list里再计算一遍会更快点? 算了, 反正纯python太慢都用pandas了, 不再测试了
那么多几个统计量呢? 比如20个? 30个? 会有追平的情况么?
测试了10, 20, 30, 50, 100, 200, 500, 都是纯python慢很多
时间消耗大概都是7倍左右, 没有缩短的迹象
有可能是因为耗时的操作不是循环, 而是列表操作
将操作中的列表改为数字(只做计数, 不再append)之后, 纯python的耗时会少一些, 但是和pandas的耗时比基本不变, 一直是5~6倍
def run_py_code(call_times_py=()):
    """Count, in a single pass, how many values fall into six ranges.

    This is the counter-based variant of the list benchmark: instead of
    appending matching values to lists, each match only bumps an integer.

    Args:
        call_times_py: Iterable of integers to classify. Defaults to an
            empty tuple so the function can still be called without
            arguments.

    Returns:
        A tuple ``(lte_200, lt_500, eq_600, lte_100, lte_300, gt_800)``
        holding the number of values that are ``<= 200``, ``< 500``,
        ``== 600``, ``<= 100``, ``<= 300`` and ``> 800`` respectively.
    """
    xxx, yyy, zzz, lt_500_py_int, lte_200_py_int, eq_600_py_int = 0, 0, 0, 0, 0, 0

    def inner_def(i):
        # The value is passed in explicitly; the counters live in the
        # enclosing scope and therefore need ``nonlocal`` to be rebound.
        nonlocal xxx, yyy, zzz, lt_500_py_int, lte_200_py_int, eq_600_py_int
        if i <= 200:
            lte_200_py_int += 1
        if i < 500:
            lt_500_py_int += 1
        if i == 600:
            eq_600_py_int += 1
        if i <= 100:
            xxx += 1
        if i <= 300:
            yyy += 1
        if i > 800:
            zzz += 1

    for value in call_times_py:
        inner_def(value)
    return (lte_200_py_int, lt_500_py_int, eq_600_py_int, xxx, yyy, zzz)
这个值和使用go语言来做(分别是单循环做全部统计, 每次循环统计一个量)差不太多
TODO 是只有基本类型会这样还是所有的统计都是pandas快?
暂时不做测试, 一般不会这么用, 需要的时候再说
尝试使用纯python多次循环, 单次循环与pandas对比, go 单次与多次循环对比
from pandas import DataFrame
import time
import random
def create_test_data(nums=1000*1000):
    """Create ``data.txt`` holding ``nums`` random integers, one per line.

    Every value is drawn with ``random.randint(1, 1000)``. The file is
    written to the current working directory and replaces any previous one.
    """
    with open("data.txt", "w") as sink:
        remaining = nums
        while remaining > 0:
            # print() appends the newline that terminates each record.
            print(random.randint(1, 1000), file=sink)
            remaining -= 1
def py_code2(call_times_py, times=10):
    """Time the "one loop per statistic" pure-Python strategy.

    Each of the ``times`` repetitions traverses ``call_times_py`` six times,
    and every traversal updates exactly one counter through its own helper
    function. The counters are never returned; only the cost of the
    traversals is of interest.

    Args:
        call_times_py: Sequence of integers to classify. It is iterated
            several times, so it must be re-iterable (e.g. a list).
        times: Number of repetitions of the six traversals.

    Returns:
        Elapsed time in seconds as a float.
    """
    lte_200_py_int = 0
    lt_500_py_int = 0
    eq_600_py_int = 0
    xxx = 0
    yyy = 0
    zzz = 0

    # Each helper receives the value explicitly instead of reading the
    # enclosing loop variable, so it does not depend on how the loops
    # below name their index.
    def inner_200(value):
        nonlocal lte_200_py_int
        if value <= 200:
            lte_200_py_int += 1

    def inner_500(value):
        nonlocal lt_500_py_int
        if value < 500:
            lt_500_py_int += 1

    def inner_600(value):
        nonlocal eq_600_py_int
        if value == 600:
            eq_600_py_int += 1

    def inner_xxx(value):
        nonlocal xxx
        if value <= 100:
            xxx += 1

    def inner_yyy(value):
        nonlocal yyy
        if value <= 300:
            yyy += 1

    def inner_zzz(value):
        nonlocal zzz
        if value > 800:
            zzz += 1

    # perf_counter() is monotonic and intended for interval measurement.
    st = time.perf_counter()
    for _ in range(times):
        for value in call_times_py:
            inner_200(value)
        for value in call_times_py:
            inner_500(value)
        for value in call_times_py:
            inner_600(value)
        for value in call_times_py:
            inner_xxx(value)
        for value in call_times_py:
            inner_yyy(value)
        for value in call_times_py:
            inner_zzz(value)
    return time.perf_counter() - st
def py_code(call_times_py, times=10):
    """Time the "single traversal" pure-Python strategy.

    ``call_times_py`` is traversed once; for every element the six range
    checks (with list appends) are executed ``times`` times, which simulates
    computing ``times`` groups of statistics in one pass. The collected
    values are never read; only the elapsed time is returned.

    Args:
        call_times_py: Iterable of integers to classify.
        times: How many times the six checks are repeated per element.

    Returns:
        Elapsed time in seconds as a float.
    """
    lte_200_py = []
    lt_500_py = []
    eq_600_py = []
    xx = []
    yy = []
    zz = []

    def inner_def(value):
        # The lists are only mutated, never rebound here, so no
        # ``nonlocal`` declaration is required.
        if value <= 200:
            lte_200_py.append(value)
        if value < 500:
            lt_500_py.append(value)
        if value == 600:
            eq_600_py.append(value)
        if value <= 100:
            xx.append(value)
        if value <= 300:
            yy.append(value)
        if value > 800:
            zz.append(value)

    # perf_counter() is monotonic and intended for interval measurement.
    st = time.perf_counter()
    for value in call_times_py:
        for _ in range(times):
            inner_def(value)
        # NOTE(review): the original indentation was lost, so the position
        # of this reset is an assumption. Resetting per element keeps the
        # buckets from growing to len(data) * times entries - confirm.
        lte_200_py = []
        lt_500_py = []
        eq_600_py = []
        xx = []
        yy = []
        zz = []
    py_time = time.perf_counter() - st
    return py_time
def pd_code(call_times_pd, times=10):
    """Time six pandas boolean-mask selections, repeated ``times`` times.

    Args:
        call_times_pd: DataFrame with a numeric ``call_times`` column.
        times: Number of repetitions of the six selections.

    Returns:
        Elapsed time in seconds as a float.
    """
    def inner_def():
        # The selections are discarded on purpose: only the cost of
        # producing them is being measured.
        lte_200_pd = call_times_pd[call_times_pd.call_times <= 200]
        lt_500_pd = call_times_pd[call_times_pd.call_times < 500]
        eq_600_pd = call_times_pd[call_times_pd.call_times == 600]
        xxx = call_times_pd[call_times_pd.call_times <= 100]
        yyy = call_times_pd[call_times_pd.call_times <= 300]
        zzz = call_times_pd[call_times_pd.call_times > 800]

    # perf_counter() is monotonic and intended for interval measurement.
    stt = time.perf_counter()
    for _ in range(times):
        inner_def()
    pd_time = time.perf_counter() - stt
    return pd_time
if __name__ == "__main__":
    # Driver: load ./data.txt once, prepare a list and a DataFrame holding
    # the same integers, then time the three strategies for each workload
    # size, three runs per size, printing one result line per run.
    with open("./data.txt") as f:
        data = f.readlines()

    call_times_py = [int(i) for i in data]
    call_times_pd = DataFrame({"call_times": data})
    # The column is created from the raw text lines, so convert it to
    # integers before it is compared against numbers.
    call_times_pd[["call_times"]] = call_times_pd[['call_times']].astype(int)

    workloads = [10, 20, 30, 50, 100, 200, 500]
    runs = range(1, 4)
    for i in workloads:
        for j in runs:
            py_time = py_code(call_times_py, i)
            pd_time = pd_code(call_times_pd, i)
            py_time2 = py_code2(call_times_py, i)
            print(f"统计量:{i} 次数:{j} 时间比: {py_time/pd_time} 详细时长: py: {py_time} pd: {pd_time} py2: {py_time2}")
统计量:10 次数:1 时间比: 8.446309077857356 详细时长: py: 5.8574230670928955 pd: 0.6934890747070312 py2: 10.970935583114624
统计量:10 次数:2 时间比: 8.19909197864528 详细时长: py: 5.218891143798828 pd: 0.6365206241607666 py2: 9.700028657913208
统计量:10 次数:3 时间比: 8.45852420498877 详细时长: py: 5.458881855010986 pd: 0.6453704833984375 py2: 10.23711371421814
统计量:20 次数:1 时间比: 7.564866761185951 详细时长: py: 9.94292688369751 pd: 1.3143558502197266 py2: 19.783536195755005
统计量:20 次数:2 时间比: 7.360162732624824 详细时长: py: 9.758050680160522 pd: 1.3257927894592285 py2: 19.761258363723755
统计量:20 次数:3 时间比: 7.508303230230394 详细时长: py: 9.933343410491943 pd: 1.3229811191558838 py2: 19.530989408493042
统计量:30 次数:1 时间比: 7.833986134979155 详细时长: py: 14.523708581924438 pd: 1.853935956954956 py2: 32.46154570579529
统计量:30 次数:2 时间比: 7.621607390748258 详细时长: py: 14.912060499191284 pd: 1.9565505981445312 py2: 29.734895944595337
统计量:30 次数:3 时间比: 7.275279705214964 详细时长: py: 14.678386211395264 pd: 2.0175700187683105 py2: 30.317322492599487
统计量:50 次数:1 时间比: 7.725853759509482 详细时长: py: 23.88418436050415 pd: 3.0914621353149414 py2: 50.774309158325195
统计量:50 次数:2 时间比: 7.254025451527843 详细时长: py: 23.80484104156494 pd: 3.281604290008545 py2: 54.80711579322815
统计量:50 次数:3 时间比: 8.11059675328286 详细时长: py: 26.42996597290039 pd: 3.258695602416992 py2: 49.66933298110962
统计量:100 次数:1 时间比: 7.395159931692905 详细时长: py: 47.11621284484863 pd: 6.371222972869873 py2: 100.95593667030334
统计量:100 次数:2 时间比: 7.285887516396374 详细时长: py: 47.732889890670776 pd: 6.551417350769043 py2: 100.88967823982239
统计量:100 次数:3 时间比: 7.5608031699271185 详细时长: py: 48.61383056640625 pd: 6.429717779159546 py2: 108.31028723716736
统计量:200 次数:1 时间比: 7.168724536572008 详细时长: py: 94.84511423110962 pd: 13.230402946472168 py2: 203.78652048110962
统计量:200 次数:2 时间比: 7.64953061578108 详细时长: py: 101.12934279441833 pd: 13.220333099365234 py2: 208.26342511177063
统计量:200 次数:3 时间比: 7.221915299871638 详细时长: py: 94.99437260627747 pd: 13.153625965118408 py2: 208.55407118797302
统计量:500 次数:1 时间比: 7.733304171085339 详细时长: py: 249.97211408615112 pd: 32.32410216331482 py2: 507.01753211021423
统计量:500 次数:2 时间比: 7.523341562433384 详细时长: py: 245.09230089187622 pd: 32.577585220336914 py2: 521.6754240989685
统计量:500 次数:3 时间比: 9.62083059863606 详细时长: py: 243.66297483444214 pd: 25.32660484313965 py2: 448.2097132205963
package main
import (
"bufio"
"fmt"
"io"
"os"
"strconv"
"strings"
"time"
)
// once_run times the "single loop" strategy: on every one of the `times`
// passes the array is traversed once and each element is tested against all
// six conditions, with matching values appended to the corresponding slice.
// The slices start empty on every pass and are never read; the function
// returns only the elapsed wall time in seconds.
func once_run(new_files *[1000000]int, times int) float64 {
	begin := time.Now()
	for pass := 0; pass < times; pass++ {
		// Fresh, empty buckets for this pass.
		lte200 := []int{}
		lt500 := []int{}
		eq600 := []int{}
		lte100 := []int{}
		lte300 := []int{}
		gt800 := []int{}
		for idx := range new_files {
			v := new_files[idx]
			if v <= 200 {
				lte200 = append(lte200, v)
			}
			if v < 500 {
				lt500 = append(lt500, v)
			}
			if v == 600 {
				eq600 = append(eq600, v)
			}
			if v <= 100 {
				lte100 = append(lte100, v)
			}
			if v <= 300 {
				lte300 = append(lte300, v)
			}
			if v > 800 {
				gt800 = append(gt800, v)
			}
		}
	}
	return time.Since(begin).Seconds()
}
// many_run times the "one loop per statistic" strategy: on every one of the
// `times` passes the array is traversed six times, each traversal testing a
// single condition and appending matches to its own slice. The slices start
// empty on every pass and are never read; the function returns only the
// elapsed wall time in seconds.
func many_run(new_files *[1000000]int, times int) float64 {
	begin := time.Now()
	total := len(new_files)
	for pass := 0; pass < times; pass++ {
		lte200 := []int{}
		for k := 0; k < total; k++ {
			if v := new_files[k]; v <= 200 {
				lte200 = append(lte200, v)
			}
		}
		lt500 := []int{}
		for k := 0; k < total; k++ {
			if v := new_files[k]; v < 500 {
				lt500 = append(lt500, v)
			}
		}
		eq600 := []int{}
		for k := 0; k < total; k++ {
			if v := new_files[k]; v == 600 {
				eq600 = append(eq600, v)
			}
		}
		lte100 := []int{}
		for k := 0; k < total; k++ {
			if v := new_files[k]; v <= 100 {
				lte100 = append(lte100, v)
			}
		}
		lte300 := []int{}
		for k := 0; k < total; k++ {
			if v := new_files[k]; v <= 300 {
				lte300 = append(lte300, v)
			}
		}
		gt800 := []int{}
		for k := 0; k < total; k++ {
			if v := new_files[k]; v > 800 {
				gt800 = append(gt800, v)
			}
		}
	}
	return time.Since(begin).Seconds()
}
// main loads the integers stored one per line in ./data.txt into a fixed-size
// array and then, for each workload size, runs once_run and many_run four
// times and prints both durations in seconds.
//
// Any failure to open, read or parse the input stops the program instead of
// silently benchmarking zeros.
func main() {
	filepath := "./data.txt"
	file, err := os.OpenFile(filepath, os.O_RDONLY, 0777)
	if err != nil {
		// Without the input file there is nothing to benchmark.
		fmt.Println(err)
		return
	}
	defer file.Close()

	buf := bufio.NewReader(file)
	file_lines := []string{}
	for {
		line, err := buf.ReadString('\n')
		// ReadString returns whatever it read before the error, so a final
		// line without a trailing newline arrives together with io.EOF and
		// must still be kept. Blank lines carry no value and are skipped.
		line = strings.TrimSpace(line)
		if line != "" {
			file_lines = append(file_lines, line)
		}
		if err == io.EOF {
			break
		}
		if err != nil {
			// A persistent read error would otherwise repeat forever.
			fmt.Println(err)
			return
		}
	}

	var new_files [1000000]int
	if len(file_lines) > len(new_files) {
		// The array has a fixed size; extra values would index out of range.
		fmt.Println("warning:", filepath, "holds", len(file_lines), "values, only the first", len(new_files), "are used")
		file_lines = file_lines[:len(new_files)]
	} else if len(file_lines) < len(new_files) {
		// Unfilled slots stay 0 and are counted by the "<=" conditions.
		fmt.Println("warning:", filepath, "holds only", len(file_lines), "values, the remaining slots stay 0")
	}
	for i, text := range file_lines {
		int_v, err := strconv.Atoi(text)
		if err != nil {
			fmt.Println(err)
			return
		}
		new_files[i] = int_v
	}

	for _, i := range []int{10, 20, 30, 50, 100, 200, 500} {
		for j := 0; j < 4; j++ {
			time_used_once := once_run(&new_files, i)
			time_used_many := many_run(&new_files, i)
			fmt.Println(i, " ", j, ":--> ", "once", time_used_once, "many", time_used_many)
		}
	}
}
可以看到, golang的单次循环和多次循环的差距没有那么大, 所以除了特殊情况, 最主要的还是循环体里的内容
10 0 :--> once 0.270043332 many 0.363176675
10 1 :--> once 0.276643586 many 0.352971341
10 2 :--> once 0.25890732 many 0.35560765
10 3 :--> once 0.245769677 many 0.370233654
20 0 :--> once 0.589726051 many 0.769544436
20 1 :--> once 0.580975835 many 0.753842546
20 2 :--> once 0.600672759 many 0.721600799
20 3 :--> once 0.596183651 many 0.766575763
30 0 :--> once 0.882573551 many 1.113109061
30 1 :--> once 0.880121284 many 1.160065195
30 2 :--> once 0.867385259 many 1.133052454
30 3 :--> once 0.889980355 many 1.136251198
50 0 :--> once 1.427478098 many 1.850872163
50 1 :--> once 1.501920479 many 1.898363955
50 2 :--> once 1.502236307 many 1.8533934570000001
50 3 :--> once 1.5080970219999998 many 1.884553839
100 0 :--> once 2.934879984 many 3.7428952300000002
100 1 :--> once 2.949133964 many 3.768129714
100 2 :--> once 2.879401105 many 3.774616441
100 3 :--> once 2.908388572 many 3.75524067
200 0 :--> once 5.265939245 many 7.3770994420000005
200 1 :--> once 5.676770116 many 7.516143486
200 2 :--> once 5.835790768 many 7.420399405
200 3 :--> once 5.757892503 many 7.498774821
500 0 :--> once 13.946337923 many 18.661140954
500 1 :--> once 13.558824813 many 18.278247642
500 2 :--> once 13.671900874 many 18.228428328
500 3 :--> once 12.964145702 many 18.017974093
有疑问加站长微信联系(非本文作者)