简单介绍了Hash及其相关的概念,简单应用Hash编写一个词典程序,并对词典的查找效率进行评估.

什么是哈希?

Hash,可以翻译为散列,也可音译为哈希,下文统一使用Hash.
http://c.biancheng.net/view/3437.html这个教程很好的介绍了Hash的原理.

一个Hash表的简单实现

键值对

template<class Key, class T>//模板列表,一个是类型Key(键),一个是类型 T(值).
struct key_value_pair//键值对
{
    key_value_pair(const key_value_pair& p) : first(p.first)
    {
        second = p.second;
    }
    //构造函数,参数是一个已有键值对的引用. 使用初始化列表来对first进行初始化,然后对second进行单独的赋值,本质上是等价于
    /*
    key_value_pair(const key_value_pair& p)
    {
        first = p.first;
        second = p.second;
    }
    */
    //且,成员变量的赋值顺序是由它们在类中的声明顺序决定的.

    key_value_pair(const Key& key, const T& t) : first(key)
    {
        second = t;
    }
    //第二个构造函数,参数是常量Key的引用,和常量t的引用,同样是初始化列表对first赋值为key,之后在函数体当中对second赋值为t.

    bool operator== (const key_value_pair& x)
    {
        return first == x.first;
    }
    //操作符重载,键值对a等于键值对b,其效果令为first值相同

    const Key first;
    T second;
};

HashMap

模板列表

1	template<class Key, class T, class HashFunc>

这个类中,有三个模板,Key,T和HasFunc

类型重新定义

1	typedef typename list<key_value_pair<const Key, T> >::iterator hash_list_itr;

把list<key_value_pair<const Key, T> >的这个类里面的迭代器,重新命名为hash_list_itr

常量

const static int DEFAULT_SIZE; //默认的大小尺寸
const static float MAX_RATIO; //最大的比率,比率与实际大小相乘为可存储的最大容量,超过以后会扩容

list <key_value_pair<const Key, T> > *buckets;
//一个list指针用来指向存储已经定义好的键值对的list容器(名字为buckets)

int count,   // 哈希表里值的数量
    length;  // 哈希表里buckets的数量

HashFunc hash;//实例化HashFunc

template<class Key, class T, class HashFunc>
const int hash_map<Key, T, HashFunc>::DEFAULT_SIZE = 211;

template<class Key, class T, class HashFunc>
const float hash_map<Key, T, HashFunc>::MAX_RATIO = 0.75;

构造函数

hash_map()
    {
        count = 0;
        length = DEFAULT_SIZE;
        buckets = new list<key_value_pair<const Key, T> >[];
    }

默认构造器,给count赋值为0,length赋值了常量DEFAULT_SIZE = 211.
而buckets是一个list,存储了键值对,键值对的两个数据类型是Key和T,长度为DEFAULT_SIZE.

void check_for_expansion()函数

该函数判断当前的哈希图是否需要扩容

void check_for_expansion()
    {
        hash_list_itr list_itr;//创建迭代器 list_itr
        int address;//声明int类型变量address

        if (count > int(MAX_RATIO * length))//判断是否满足扩容条件,即已有键值对达到总共键值对*最大允许的比率
        {
            list< key_value_pair<const Key, T> >* temp_buckets = buckets;
            //创建一个新的键值对链表,和旧的指向同一链表

            length = 2 * length + 1;//确保旧的链表长度是小于新的length的一半的.
            buckets = new list< key_value_pair< const Key, T> >[length];//buckets重新申请一块新的内存,大小为扩容以后的length
            Key new_key;//声明一个新的Key值

            for (int i = 0; i < length / 2; i++)//深拷贝,旧的链表长度是小于新的length的一半的.
                if (!temp_buckets[i].empty())//只要当前节点的键值对不空,开始第一个内循环
                    for (list_itr = temp_buckets[i].begin();
                        list_itr != temp_buckets[i].end();
                        list_itr++)
                        //迭代器从旧的链表开始循环,到最后一个节点停下
                    {
                        new_key = (*list_itr).first;//把迭代器的键值赋值给new_key
                        address = hash(new_key) % length;//调用哈希函数,之后对新的长度取余,确保新地址是小于length的.
                        buckets[address].push_back(*list_itr);//在list的address位置后面插入list_itr指向的键值对.
                        //问题是,new_key的意义何在?
                    } // posting temp_buckets [i] back into buckets
            for (int i = 0; i < length / 2; i++)
                if (!temp_buckets[i].empty())
                    temp_buckets[i].erase(temp_buckets[i].begin(),
                        temp_buckets[i].end());
            //该循环是temp_buckets中的旧的hash表删去

        } // doubling buckets size
    }

操作符重载”==”

bool operator== (const hash_map<Key, T, HashFunc>& other)//参数为另一个hash_map
    {
        return (buckets == other.buckets) &&
            (count == other.count) &&
            (length == other.length);//当两个hash_map的buckets,count和length都相等,返回true,否则false
    }

int size()函数

int size()
    {
        return count;
    }

这个函数直接返回count即可

迭代器

friend class hash_map<Key, T, HashFunc>;
首先是hash_map的友元

成员变量

1
2
3

unsigned index, length;//地址和长度
hash_list_itr list_itr;//list迭代器
list<key_value_pair<const Key, T> > *buckets;//指向list<键值对>的指针 buckets

操作符重载-“==,!=,++,*,[]”

bool operator== (const iterator& other) const
        {
            return (index == other.index) &&
                (list_itr == other.list_itr) &&
                (buckets == other.buckets) &&
                (length == other.length);
        } // ==
        //四个成员变量全部相同的时候返回true

        bool operator!= (const iterator& other) const
        {
            return !(*this == other);
        } // operator!=
        //代码复用,这里复用了操作符"=="

        iterator operator++ (int)
        {
            iterator old_itr = *this;//old_itr与当前迭代器指向同一个hash_map
            if (++list_itr != buckets[index].end())//判断,如果itr自增以后不等于最后一个元素,则返回old_itr
                return old_itr;
            while (index < length - 1)
            {
                index++;
                if (!buckets[index].empty())
                {
                    list_itr = buckets[index].begin();
                    return old_itr;
                } // bucket entry not empty
            }// while
            list_itr = buckets[index].end();
            return old_itr;
        } // postincrement operator++
        //

        key_value_pair<const Key, T>& operator* ()
        {
            return *list_itr;
        } // operator*

begin()函数

// Postcondition: an iterator positioned at the beginning
    //                of this hash_map has been returned.
    iterator begin()
    {
        int i = 0;
        iterator itr;//创建了要返回的itr

        itr.buckets = buckets; //friend class
        //新的迭代器的buckets指针指向旧的
        while (i < length - 1)
        {
            if (!buckets[i].empty())
            {
                itr.list_itr = buckets[i].begin();
                itr.index = i;
                itr.length = length;
                return itr;
            } // buckets [i] not empty
            i++;
        } // while循环,只要buckets[i]为空,就i++,不空则令指针指向buckets的第一个元素,index=i. 这个循环的目的在于找到当前列表的buckets中最后一个元素的地址
        itr.list_itr = buckets[i].end();//循环结束如果没有找到,则说明这个buckets是满的,直接指向end即可.
        itr.index = i;
        itr.length = length;
        return itr;
    }

返回的迭代器指向hash_map的开始的位置,但是迭代器内部的index变量是最后一个元素的地址.

end()函数

// Postcondition: an iterator has been returned that can be used
    //                in comparisons to terminate iterating through
    //                this hash_map.
    iterator end()
    {
        int i = length - 1;
        iterator itr;

        itr.buckets = buckets;
        itr.list_itr = buckets[i].end();
        itr.index = i;
        itr.length = length;
        return itr;
    } // end

该函数返回一个迭代器,迭代器里的list_itr指向buckets的最后一个元素

insert()函数

pair<iterator, bool> insert(const key_value_pair<const Key, T>& x)
    {
        iterator old_itr;
        Key key = x.first;

        old_itr = find(key);
        if (old_itr != end())
            return pair <iterator, bool>(old_itr, false);
        int address = hash(key) % length;
        buckets[address].push_back(x);
        count++;
        check_for_expansion();
        return pair<iterator, bool>(find(key), true);
    }

首先,参数是一个键值对x. 函数首先分析这个键值对的key值是否已经存在,如果存在,返回键值对<迭代器,布尔值>,迭代器指向哈希表中的这个key值,布尔值为false.
如果没有,则在哈希表中创建该键值对.

find()函数

iterator find(const Key& key)
   {
       int address = hash(key) % length;
       iterator itr;
       itr.list_itr = std::find(buckets[address].begin(),
           buckets[address].end(),
           key_value_pair<const Key, T>(key, T()));
       if (itr.list_itr == buckets[address].end())
           return end();
       itr.buckets = buckets;
       itr.index = address;
       itr.length = length;
       return itr;
   }

在哈希表里面寻找是否已经存在和参数key值相同的key值

erase()函数

void erase(iterator itr)
    {
        buckets[itr.index].erase(itr.list_itr);
        count--;
    }

调用list类里面的erase.

析构函数

~hash_map()
   {
       delete[] buckets;
   }

实验部分

drive.h

hash_func

1	unsigned long operator() (const string key);

对()操作符进行重载,参数是string类型的变量key.

HashMapDriver类

class HashMapDriver
{
protected:

    fstream inFile; //声明fstream类inFile

    hash_map <string, string, hash_func> dict; 
    //创建一个叫做dict(词典)的哈希表,他的模板模板变元中,属于键值对的两个模板变元都是string,HashFunc模板变元则是hash_func对象.

public:

    void setUpFiles();
    //打开存储词汇表的文本文件以供读取
   
    void readAndProcess();
    //重复读取词汇表文本文件的每一行，同时构建英文词典，该词典以hash map方式存储

    void testDictionary(); 
    //从用户键盘输入读取一个单词，在词典中查找并返回中文释义
}; // class HashMapDriver

实验要求

实现readAndProcess() 和 testDictionary()
设计实现哈希函数类hash_func
对你的词典进行效率测试.修改hash map类以获取你所实现的英文词典的如下信息：
1. 装填因子，即存储单词的数目/散列表数组的长度
2. 所有IELTS词汇表中的单词在查找时的比较次数的最大值和平均值
  上述两个指标可以在一定程度上反映你所实现的英文词典的查找效率。