散列表实现查找

✨文章摘要（AI生成）

作为一名技术专家，我深入探讨了散列表的实现及其查找过程。文章首先通过 Python 代码从 Excel 文件中读取数据，并去除缺失值，最终得到 75 条企业信息。接着，我实现了两个主要的哈希方法：平方取中法和除留余数法，并分别应用开放地址法和公共溢出法解决冲突。

在查找过程中，用户可以选择根据电话号码或企业名称进行搜索。通过对比两种查找方法的平均查找长度（ASL），结果显示开放地址法的效率明显优于公共溢出法，且平方取中法与除留余数法在地址映射的重复性方面表现相似。最终得出的结果表明，数据重复度越高，查找效率越低，开放地址法在存储空间和查找效率之间达成了一种平衡。

Library

python

import pandas as pd
import numpy as np
import time

读取数据

python

df = pd.read_excel('重庆市印刷和记录媒介复制业 754.xlsx')
df.dropna(axis=0, how='any')  # 去除非数
print("表长为：", len(df))
df.head()

表长为： 75

	ID	企业名称	电话	企业地址
0	0	万州区永佳路万德印刷厂	15178905742	重庆市万州区双河口永佳路 325 号
1	1	覃彩虹	13594410133	重庆市万州区熊家镇古城大道 296 号
2	2	重庆优得利印刷有限公司	023-65903102	重庆市江津区双福工业园 1 幢 1 号
3	3	重庆市开州区森美印刷有限公司	15608330060	重庆市开州区云枫街道平桥社区桔乡路 369 号门市
4	4	吕兴华	13896015224	重庆市璧山区大路街道天星街 23 号

get_nms

python

def get_nums(s):
    '''
    逐位相加其 ASCII 码
    '''
    s = str(s)
    nums = 0
    for s0 in s:
        if(s0 == '-'):
            continue
        try:
            nums += ord(s0)
        except:
            print("error: ",s)
    return nums
get_nums("重庆优得利印刷有限公司")

平方取中法

python

df['nums_电话'] = df['电话'].apply(get_nums)
df['nums_电话^2'] = df['nums_电话'].apply(lambda x: np.square(x))
df['nums_企业名称'] = df['企业名称'].apply(get_nums)
# df['nums_电话^4'] = df['nums_电话^2'].apply(lambda x: np.square(x))
df.head()

	ID	企业名称	电话	企业地址	nums_电话	nums_电话^2	nums_企业名称
0	0	万州区永佳路万德印刷厂	15178905742	重庆市万州区双河口永佳路 325 号	577	332929	257952
1	1	覃彩虹	13594410133	重庆市万州区熊家镇古城大道 296 号	562	315844	94053
2	2	重庆优得利印刷有限公司	023-65903102	重庆市江津区双福工业园 1 幢 1 号	559	312481	276879
3	3	重庆市开州区森美印刷有限公司	15608330060	重庆市开州区云枫街道平桥社区桔乡路 369 号门市	560	313600	364365
4	4	吕兴华	13896015224	重庆市璧山区大路街道天星街 23 号	569	323761	63703

python

def get_mid(x):
    '''
    取中间三位数作为地址
    '''
    return int(x/10)%1000

python

df['adr_电话'] = df['nums_电话^2'].apply(get_mid)
df['adr_企业名称'] = df['nums_企业名称'].apply(get_mid)
df.head(100)

	ID	企业名称	电话	企业地址	nums_电话	nums_电话^2	nums_企业名称	adr_电话	adr_企业名称
0	0	万州区永佳路万德印刷厂	15178905742	重庆市万州区双河口永佳路 325 号	577	332929	257952	292	795
1	1	覃彩虹	13594410133	重庆市万州区熊家镇古城大道 296 号	562	315844	94053	584	405
2	2	重庆优得利印刷有限公司	023-65903102	重庆市江津区双福工业园 1 幢 1 号	559	312481	276879	248	687
3	3	重庆市开州区森美印刷有限公司	15608330060	重庆市开州区云枫街道平桥社区桔乡路 369 号门市	560	313600	364365	360	436
4	4	吕兴华	13896015224	重庆市璧山区大路街道天星街 23 号	569	323761	63703	376	370
...	...	...	...	...	...	...	...	...	...
95	95	重庆涵天包装印务有限公司	023-72231721	重庆市涪陵区太极大道 34 号负一楼	558	311364	318409	136	840
96	96	垫江县金辉印刷厂	13110182246	重庆市垫江县周嘉镇迎春路	557	310249	209484	24	948
97	97	重庆凯翔包装印务有限公司	13709401186	重庆市永川区大安街道办事处大安工业园区	568	322624	321198	262	119
98	98	丰都县蓝图印务有限公司	15803661811	重庆市丰都县三合街道商业二路 117 号附 2 号	568	322624	284565	262	456
99	99	重庆毅然包装印刷有限公司	13509402066	重庆市綦江区文龙街道大田湾 6 号	564	318096	323964	809	396

除留余数法

python

df['adr_电话_1'] = df['nums_电话^2']%1007
df['adr_企业名称_1'] = df['nums_企业名称']%1007
df.head()

	ID	企业名称	电话	企业地址	nums_电话	nums_电话^2	nums_企业名称	adr_电话	adr_企业名称	adr_电话_1	adr_企业名称_1
0	0	万州区永佳路万德印刷厂	15178905742	重庆市万州区双河口永佳路 325 号	577	332929	257952	292	795	619	160
1	1	覃彩虹	13594410133	重庆市万州区熊家镇古城大道 296 号	562	315844	94053	584	405	653	402
2	2	重庆优得利印刷有限公司	023-65903102	重庆市江津区双福工业园 1 幢 1 号	559	312481	276879	248	687	311	961
3	3	重庆市开州区森美印刷有限公司	15608330060	重庆市开州区云枫街道平桥社区桔乡路 369 号门市	560	313600	364365	360	436	423	838
4	4	吕兴华	13896015224	重庆市璧山区大路街道天星街 23 号	569	323761	63703	376	370	514	262

创建哈希表(分别使用开发地址与公共溢出区解决冲突)

python

#初始化为全 0

# 开放地址法所使用的 hash
hash_map_tele = np.zeros(32000)
hash_map_name = np.zeros(8000)

# 公共溢出区所使用的 hash
hash_map_tele_1 = np.zeros(2100)
hash_map_name_1 = np.zeros(2100)

len(hash_map_tele)

python

#探测开放地址法
def create_hash_map_tele(x, adr, df_ID):
    try:
        adr = int(x[adr])
    except:
        print('error: ', adr)
    while(hash_map_tele[adr] != 0):
        adr += 800
    hash_map_tele[adr] = x[df_ID]

def create_hash_map_name(x, adr, df_ID):
    try:
        adr = int(x[adr])
    except:
        print('error: ', adr)
    while(hash_map_name[adr] != 0):
        adr += 800
    hash_map_name[adr] = x[df_ID]

#使用公共溢出区
count1 = 0
count2 = 0
def create_hash_map_tele1(x, adr, df_ID):
    global count1
    if(hash_map_tele_1[x[adr]] == 0):
        hash_map_tele_1[x[adr]] = x[df_ID]
    else:
        hash_map_tele_1[1100 + count1] = x[df_ID]
        count1 += 1

def create_hash_map_name1(x, adr, df_ID):
    global count2
    if(hash_map_name_1[x[adr]] == 0):
        hash_map_name_1[x[adr]] = x[df_ID]
    else:
        hash_map_name_1[1100 + count2] = x[df_ID]
        count2 += 1

python

df.apply(create_hash_map_tele, axis=1, args=('adr_电话', 'ID'))
df.apply(create_hash_map_name, axis=1, args=('adr_企业名称', 'ID'))

df.apply(create_hash_map_tele1, axis=1, args=('adr_电话_1', 'ID'))
df.apply(create_hash_map_name1, axis=1, args=('adr_企业名称_1', 'ID'))

hash_map_tele

array([0., 0., 0., ..., 0., 0., 0.])

查找流程(平方取中+开发地址探测法)-method1

python

# 查找
search_method = int(input("请输入你查找关键词的类型：1，电话查找；2，企业名称查找"))
if(search_method == 1):
    tele = input("请输入你要查找对象的电话号码：")
    tele = int(tele)
    time_start = time.time()
    nums = get_mid(pow(get_nums(tele), 2))
    print("-----base_nums-----\n",nums)
    print("-----hash_map_tele_value-----\n", hash_map_tele[nums])
    print("-----开始查找-----")
    while(df['电话'][hash_map_tele[nums]] != tele):
        # print("-----tele-----\n", df['电话'][hash_map_tele[nums]])
        nums += 800
        # print('-----add_nums-----\n', nums)
        if(nums >= 32000):
            print('查找错误：无该信息')
            break
    time_end = time.time()
    if(nums < 32000):
        print("---------------你查找的信息如下：---------------\n", df.loc[hash_map_tele[nums]])

    print("---------------本次查找耗时：---------------\n",time_end-time_start)
elif(search_method == 2):
    name = input("请输入你要查找对象的企业名称：")
    time_start = time.time()
    nums = get_mid(get_nums(name))
    print("-----base_nums-----\n",nums)
    print("-----hash_map_tele_value-----\n", hash_map_name[nums])
    print("-----开始查找-----")
    while(df['企业名称'][hash_map_name[nums]] != name):
        # print("-----tele-----\n", df['企业名称'][hash_map_name[nums]])
        nums += 800
        # print('-----add_nums-----\n', nums)
        if(nums >= 8000):
            print('查找错误：无该信息')
            break
    time_end = time.time()
    if(nums < 8000):
        print("---------------你查找的信息如下：---------------\n", df.loc[hash_map_name[nums]])

    print("---------------本次查找耗时：---------------\n",time_end-time_start)
else:
    print("请选择正确的查找方式！")

-----base_nums-----
 370
-----hash_map_tele_value-----
 4.0
-----开始查找-----
---------------你查找的信息如下：---------------
 ID                          4
企业名称                      吕兴华
电话                13896015224
企业地址         重庆市璧山区大路街道天星街 23 号
nums_电话                   569
nums_电话^2              323761
adr_电话                    376
nums_企业名称               63703
adr_企业名称                  370
Name: 4, dtype: object
---------------查找耗时：---------------
 0.0009999275207519531

查找流程(除留余数法+公共溢出法)-method2

python

# 查找
search_method = int(input("请输入你查找关键词的类型：1，电话查找；2，企业名称查找"))
if(search_method == 1):
    tele = input("请输入你要查找对象的电话号码：")
    tele = int(tele)
    time_start = time.time()
    nums = get_nums(tele)%1007
    print("-----base_nums-----\n",nums)
    print("-----hash_map_tele_value-----\n", hash_map_tele_1[nums])
    print("-----开始查找-----")
    while(df['电话'][hash_map_tele_1[nums]] != tele):
        # print("-----tele-----\n", df['电话'][hash_map_tele_1[nums]])
        nums += 1
        # print('-----add_nums-----\n', nums)
        if(nums >= 2100):
            print('查找错误：无该信息')
            break
    time_end = time.time()
    if(nums < 2100):
        print("---------------你查找的信息如下：---------------\n", df.loc[hash_map_tele_1[nums]])

    print("---------------本次查找耗时：---------------\n",time_end-time_start)
elif(search_method == 2):
    name = input("请输入你要查找对象的企业名称：")
    time_start = time.time()
    nums = get_nums(name)%1007
    print("-----base_nums-----\n",nums)
    print("-----hash_map_tele_value-----\n", hash_map_name_1[nums])
    print("-----开始查找-----")
    while(df['企业名称'][hash_map_name_1[nums]] != name):
        # print("-----tele-----\n", df['企业名称'][hash_map_name_1[nums]])
        nums += 1
        # print('-----add_nums-----\n', nums)
        if(nums >= 2100):
            print('查找错误：无该信息')
            break
    time_end = time.time()
    if(nums < 2100):
        print("---------------你查找的信息如下：---------------\n", df.loc[hash_map_name_1[nums]])

    print("---------------本次查找耗时：---------------\n",time_end-time_start)
else:
    print("请选择正确的查找方式！")

-----base_nums-----
 262
-----hash_map_tele_value-----
 4.0
-----开始查找-----
---------------你查找的信息如下：---------------
 ID                           4
企业名称                       吕兴华
电话                 13896015224
企业地址          重庆市璧山区大路街道天星街 23 号
nums_电话                    569
nums_电话^2               323761
nums_企业名称                63703
adr_电话                     376
adr_企业名称                   370
adr_电话_1                   514
adr_企业名称_1                 262
Name: 4, dtype: object
---------------本次查找耗时：---------------
 0.005001544952392578

性能分析

python

df.head(10)

	ID	企业名称	电话	企业地址	nums_电话	nums_电话^2	nums_企业名称	adr_电话	adr_企业名称	adr_电话_1	adr_企业名称_1
0	0	万州区永佳路万德印刷厂	15178905742	重庆市万州区双河口永佳路 325 号	577	332929	257952	292	795	619	160
1	1	覃彩虹	13594410133	重庆市万州区熊家镇古城大道 296 号	562	315844	94053	584	405	653	402
2	2	重庆优得利印刷有限公司	023-65903102	重庆市江津区双福工业园 1 幢 1 号	559	312481	276879	248	687	311	961
3	3	重庆市开州区森美印刷有限公司	15608330060	重庆市开州区云枫街道平桥社区桔乡路 369 号门市	560	313600	364365	360	436	423	838
4	4	吕兴华	13896015224	重庆市璧山区大路街道天星街 23 号	569	323761	63703	376	370	514	262
5	5	重庆海渝包装印刷有限公司	13883451070	重庆市璧山区奥康工业园金剑路 271 号	568	322624	323605	262	360	384	358
6	6	重庆鼎鸿印务有限公司	17782279989	重庆市永川区来龙四路 36 号	597	356409	292462	640	246	938	432
7	7	重庆奔速彩印有限公司	023-62802235	重庆市九龙坡区歇台子渝州路 100 号附 1-5-1 号	561	314721	274268	472	426	537	364
8	8	重庆市永川区木森印刷有限公司	18580704257	重庆市永川区三教镇三川路 216 号	575	330625	361502	62	150	329	996
9	9	重庆梧桐树印务有限公司	023-67980980	重庆市渝中区张家花园街 206 号	580	336400	291369	640	136	62	346

python

print("表长为：", len(df))

表长为： 754

32000 与 8000 的来历

python

print("重复元素最多次数 fen'bie 为：")
for i in range(500):
    flag = 0
    for j in range(800):
        index = 800*i + j
        if(hash_map_tele[index] != 0):
            flag = 1
    if(flag == 0):
        print("tele_i:", i)
        break

for i in range(500):
    flag = 0
    for j in range(800):
        index = 800*i + j
        if(hash_map_name[index] != 0):
            flag = 1
    if(flag == 0):
        print("name_i:", i)
        break

tele_i: 39
name_i: 9

计算 method1 的 ASL

python

c1 = 0  # 统计总的查找次数
for i in range(500):
    for j in range(800):
        index = 800*i + j
        if(hash_map_tele[index] != 0):
            c1 += (i+1)
print('method1-tele-asl:', c1/754)
c2 = 0  # 统计总的查找次数
for i in range(500):
    for j in range(800):
        index = 800*i + j
        if(hash_map_name[index] != 0):
            c2 += (i+1)
print('method1-name-asl:', c2/754)

method1-tele-asl: 12.10079575596817
method1-name-asl: 1.6498673740053051

计算 method2 的 ASL

python

df1 = df.adr_电话_1.value_counts() 
print(df1)

514    39
916    33
329    32
47     31
767    31
187    29
216    29
473    28
619    27
646    26
384    26
62     26
9      25
372    25
175    21
530    20
6      20
852    19
256    18
130    18
780    17
690    17
343    16
917    16
653    15
537    15
685    13
423    12
513    11
891    11
201    10
771    10
311     9
859     7
994     7
93      6
386     5
568     5
938     5
206     4
28      4
688     3
890     3
788     2
119     2
590     1
309     1
752     1
494     1
894     1
434     1
Name: adr_电话_1, dtype: int64

python

df2 = df.adr_企业名称_1.value_counts() 
print(df2)

68      6
90      5
406     5
231     4
979     4
       ..
439     1
768     1
445     1
446     1
1006    1
Name: adr_企业名称_1, Length: 516, dtype: int64

python

print("有多少个元素在 method2-tele 基础表中，即只需查一次：",len(df1))
print("有多少个元素在 method2-name 基础表中，即只需查一次：",len(df2))

有多少个元素在 method2-tele 基础表中，即只需查一次： 51
有多少个元素在 method2-name 基础表中，即只需查一次： 516

python

# 基本表中的元素只需查一次，溢出区的元素和顺序表的查找次数是一样的，所以可以使用等差数列的计算公式进行计算，不过需要注意的就是溢出区的查找次数是从 2 开始的
print("method2-tele-asl:", (51 + (703*2 + (703*702)/2))/754)
print("method2-name-asl:", (516 + (238*2 + (238*237)/2))/754)

method2-tele-asl: 329.19098143236073
method2-name-asl: 38.720159151193634

最终得出结果

method1-tele-asl: 12.10079575596817
method1-name-asl: 1.6498673740053051
method2-tele-asl: 329.19098143236073
method2-name-asl: 38.720159151193634

分析

总的来说，重复元素的个数越多，哈希查找的效率就相应越低而法 1 的开放地址探测法对冲突的解决在本题中查找效率是明显优于公共溢出法的，但相应的占据了更大的存储空间至于平方去中法以及除留余数法在本题中映射出来的地址重复个数基本一致，故不相上下。

散列表实现查找 ​

Library ​

读取数据 ​

get_nms ​

平方取中法 ​

除留余数法 ​

创建哈希表(分别使用开发地址与公共溢出区解决冲突) ​

查找流程(平方取中+开发地址探测法)-method1 ​

查找流程(除留余数法+公共溢出法)-method2 ​

性能分析 ​

32000 与 8000 的来历 ​

计算 method1 的 ASL ​

计算 method2 的 ASL ​

最终得出结果 ​

分析 ​

散列表实现查找

Library

读取数据

get_nms

平方取中法

除留余数法

创建哈希表(分别使用开发地址与公共溢出区解决冲突)

查找流程(平方取中+开发地址探测法)-method1

查找流程(除留余数法+公共溢出法)-method2

性能分析

32000 与 8000 的来历

计算 method1 的 ASL

计算 method2 的 ASL

最终得出结果

分析