[U-Boot] [Patch] Fix hash table deletion to prevent lost entries

Wed Jan 19 17:32:08 CET 2011

>> The hash delete code is in error; instead of just removing the deleted
>> key, it should instead allocate a new hashtable, hash all the keys into
>> the new table except for the deleted key and then reclaim the old table
>> (and deleted key).
> Can you please come up with a patch?

Hashtable delete is broken since freeing entry could break collision
chain for other entries.  Instead free key/data pair and mark entry as 
deleted (via
negative used value) and then skip deleted entries while probing.

Break up hsearch_r into separate henter_r/hfind_r functions for 
readability.
Recycle first deleted entry in probe chain when adding entry to table.
Factor primary/secondary hash calculations into functions and use
in henter_r/hfind_r/hdelete_r.

---
  include/search.h |   16 ++-
  lib/hashtable.c  |  382 
++++++++++++++++++++++++++++++++---------------------
  2 files changed, 241 insertions(+), 157 deletions(-)

diff --git a/include/search.h b/include/search.h
index a7c1293..4466731 100644
--- a/include/search.h
+++ b/include/search.h
@@ -66,12 +66,16 @@ extern int hcreate_r(size_t __nel, struct 
hsearch_data *__htab);
  extern void hdestroy_r(struct hsearch_data *__htab);

  /*
- * Search for entry matching ITEM.key in internal hash table.  If
- * ACTION is `FIND' return found entry or signal error by returning
- * NULL.  If ACTION is `ENTER' replace existing data (if any) with
- * ITEM.data.
- * */
-extern int hsearch_r(ENTRY __item, ACTION __action, ENTRY ** __retval,
+ * Search for entry matching ITEM.key in internal hash table.  Return
+ * found entry or signal error by returning NULL.
+ */
+extern int hfind_r(ENTRY __item, ENTRY ** __retval,
+             struct hsearch_data *__htab);
+/*
+ * Add entry matching ITEM.key in internal hash table.  Replace
+ * existing data (if any) with ITEM.data.
+ */
+extern int henter_r(ENTRY __item, ENTRY ** __retval,
               struct hsearch_data *__htab);

  /*
diff --git a/lib/hashtable.c b/lib/hashtable.c
index 9f069c0..7b5153a 100644
--- a/lib/hashtable.c
+++ b/lib/hashtable.c
@@ -65,7 +65,7 @@
   * which describes the current status.
   */
  typedef struct _ENTRY {
-    unsigned int used;
+    int used;
      ENTRY entry;
  } _ENTRY;

@@ -152,7 +152,7 @@ void hdestroy_r(struct hsearch_data *htab)

      /* free used memory */
      for (i = 1; i <= htab->size; ++i) {
-        if (htab->table[i].used) {
+        if (htab->table[i].used > 0) {
              ENTRY *ep = &htab->table[i].entry;

              free(ep->key);
@@ -165,77 +165,20 @@ void hdestroy_r(struct hsearch_data *htab)
      htab->table = NULL;
  }

-/*
- * hsearch()
- */
-
-/*
- * This is the search function. It uses double hashing with open 
addressing.
- * The argument item.key has to be a pointer to an zero terminated, most
- * probably strings of chars. The function for generating a number of the
- * strings is simple but fast. It can be replaced by a more complex 
function
- * like ajw (see [Aho,Sethi,Ullman]) if the needs are shown.
- *
- * We use an trick to speed up the lookup. The table is created by hcreate
- * with one more element available. This enables us to use the index zero
- * special. This index will never be used because we store the first hash
- * index in the field used where zero means not used. Every other value
- * means used. The used field can be used as a first fast comparison for
- * equality of the stored and the parameter value. This helps to prevent
- * unnecessary expensive calls of strcmp.
- *
- * This implementation differs from the standard library version of
- * this function in a number of ways:
- *
- * - While the standard version does not make any assumptions about
- *   the type of the stored data objects at all, this implementation
- *   works with NUL terminated strings only.
- * - Instead of storing just pointers to the original objects, we
- *   create local copies so the caller does not need to care about the
- *   data any more.
- * - The standard implementation does not provide a way to update an
- *   existing entry.  This version will create a new entry or update an
- *   existing one when both "action == ENTER" and "item.data != NULL".
- * - Instead of returning 1 on success, we return the index into the
- *   internal hash table, which is also guaranteed to be positive.
- *   This allows us direct access to the found hash table slot for
- *   example for functions like hdelete().
- */
-
-int hmatch_r(const char *match, int last_idx, ENTRY ** retval,
-         struct hsearch_data *htab)
-{
-    unsigned int idx;
-    size_t key_len = strlen(match);
-
-    for (idx = last_idx + 1; idx < htab->size; ++idx) {
-        if (!htab->table[idx].used)
-            continue;
-        if (!strncmp(match, htab->table[idx].entry.key, key_len)) {
-            *retval = &htab->table[idx].entry;
-            return idx;
-        }
-    }
-
-    __set_errno(ESRCH);
-    *retval = NULL;
-    return 0;
-}

-int hsearch_r(ENTRY item, ACTION action, ENTRY ** retval,
-          struct hsearch_data *htab)
+/* Primary hash function */
+int hash_primary(ENTRY *item, struct hsearch_data *htab)
  {
+    int len = strlen(item->key);
      unsigned int hval;
      unsigned int count;
-    unsigned int len = strlen(item.key);
-    unsigned int idx;

-    /* Compute an value for the given string. Perhaps use a better 
method. */
+    /* Compute a value for the given string. Perhaps use a better 
method. */
      hval = len;
      count = len;
      while (count-- > 0) {
          hval <<= 4;
-        hval += item.key[count];
+        hval += item->key[count];
      }

      /*
@@ -246,20 +189,95 @@ int hsearch_r(ENTRY item, ACTION action, ENTRY ** 
retval,
      if (hval == 0)
          ++hval;

+    return hval;
+}
+
+/* Secondary hash function */
+int hash_secondary(unsigned int hval, struct hsearch_data *htab)
+{
+    unsigned int hval2;
+
+    /*
+     * Second hash function:
+     * as suggested in [Knuth]
+     */
+    hval2 = 1 + hval % (htab->size - 2);
+    return hval2;
+}
+
+/*
+ * Find an entry in the hashtable. Skip negative used counts; these mark
+ * deleted entries.
+ */
+int hfind_r(ENTRY item, ENTRY ** retval, struct hsearch_data *htab)
+{
+    unsigned int hval;
+    unsigned int idx;
+
      /* The first index tried. */
-    idx = hval;
+    idx = hval = hash_primary(&item, htab);
+
+    debug("hash(%s) = %d/%d; used %d\n", item.key, idx, htab->size, 
htab->table[idx].used);
+
+    do {
+        unsigned int hval2;
+
+        if (htab->table[idx].used == hval
+ && strcmp(item.key, htab->table[idx].entry.key) == 0) {
+            /* return found entry */
+            *retval = &htab->table[idx].entry;
+            return idx;
+        }
+
+        if (!htab->table[idx].used)  {
+            /* end of chain, return empty */
+            __set_errno(ESRCH);
+            *retval = NULL;
+            return 0;
+        }
+
+        hval2 = hash_secondary(hval, htab);

-    if (htab->table[idx].used) {
          /*
-         * Further action might be required according to the
-         * action value.
+         * Because SIZE is prime this guarantees to
+         * step through all available indices.
           */
-        unsigned hval2;
+        if (idx <= hval2)
+            idx = htab->size + idx - hval2;
+        else
+            idx -= hval2;
+
+        /* If we visited all entries leave the loop */
+    } while (idx != hval);
+    __set_errno(ESRCH);
+    *retval = NULL;
+    return 0;
+}
+
+/*
+ * Add/update an entry in the table; search the whole chain
+ * looking for the entry.  Remember first index of deleted entry and
+ * use if found to hold new entry otherwise use empty entry at end of
+ * probe chain
+ */
+int henter_r(ENTRY item, ENTRY ** retval, struct hsearch_data *htab)
+{
+    unsigned int hval;
+    unsigned int idx;
+    int deleted_entry = 0;
+
+    /* The first index tried. */
+    idx = hval = hash_primary(&item, htab);
+
+    debug("hash(%s) = %d/%d; used %d\n", item.key, idx, htab->size, 
htab->table[idx].used);
+
+    do {
+        unsigned int hval2;

          if (htab->table[idx].used == hval
- && strcmp(item.key, htab->table[idx].entry.key) == 0) {
+ && strcmp(item.key, htab->table[idx].entry.key) == 0) {
              /* Overwrite existing value? */
-            if ((action == ENTER) && (item.data != NULL)) {
+            if (item.data != NULL) {
                  free(htab->table[idx].entry.data);
                  htab->table[idx].entry.data =
                      strdup(item.data);
@@ -274,82 +292,122 @@ int hsearch_r(ENTRY item, ACTION action, ENTRY ** 
retval,
              return idx;
          }

-        /*
-         * Second hash function:
-         * as suggested in [Knuth]
-         */
-        hval2 = 1 + hval % (htab->size - 2);
-
-        do {
+        if (htab->table[idx].used < 0) {
+            /* Remember deleted entry to use for inserution */
+            if (!deleted_entry)
+                deleted_entry = idx + 1;
+        } else if (!htab->table[idx].used)  {
              /*
-             * Because SIZE is prime this guarantees to
-             * step through all available indices.
+             * If table is full and another entry should be
+             * entered return with error.
               */
-            if (idx <= hval2)
-                idx = htab->size + idx - hval2;
-            else
-                idx -= hval2;
+            if (htab->filled == htab->size) {
+                __set_errno(ENOMEM);
+                *retval = NULL;
+                return 0;
+            }

              /*
-             * If we visited all entries leave the loop
-             * unsuccessfully.
+             * Create new entry; recyle deleted entry if found
+             * create copies of item.key and item.data
               */
-            if (idx == hval)
-                break;
-
-            /* If entry is found use it. */
-            if ((htab->table[idx].used == hval)
- && strcmp(item.key, htab->table[idx].entry.key) == 0) {
-                /* Overwrite existing value? */
-                if ((action == ENTER) && (item.data != NULL)) {
-                    free(htab->table[idx].entry.data);
-                    htab->table[idx].entry.data =
-                        strdup(item.data);
-                    if (!htab->table[idx].entry.data) {
-                        __set_errno(ENOMEM);
-                        *retval = NULL;
-                        return 0;
-                    }
-                }
-                /* return found entry */
-                *retval = &htab->table[idx].entry;
-                return idx;
+            if (deleted_entry)
+                idx = deleted_entry - 1;
+
+            htab->table[idx].used = hval;
+            htab->table[idx].entry.key = strdup(item.key);
+            htab->table[idx].entry.data = strdup(item.data);
+            if (!htab->table[idx].entry.key ||
+                !htab->table[idx].entry.data) {
+                __set_errno(ENOMEM);
+                *retval = NULL;
+                return 0;
              }
-        }
-        while (htab->table[idx].used);
-    }

-    /* An empty bucket has been found. */
-    if (action == ENTER) {
-        /*
-         * If table is full and another entry should be
-         * entered return with error.
-         */
-        if (htab->filled == htab->size) {
-            __set_errno(ENOMEM);
-            *retval = NULL;
-            return 0;
+            ++htab->filled;
+
+            /* return new entry */
+            *retval = &htab->table[idx].entry;
+            return 1;
          }

+        hval2 = hash_secondary(hval, htab);
+
          /*
-         * Create new entry;
-         * create copies of item.key and item.data
+         * Because SIZE is prime this guarantees to
+         * step through all available indices.
           */
-        htab->table[idx].used = hval;
-        htab->table[idx].entry.key = strdup(item.key);
-        htab->table[idx].entry.data = strdup(item.data);
-        if (!htab->table[idx].entry.key ||
-            !htab->table[idx].entry.data) {
-            __set_errno(ENOMEM);
-            *retval = NULL;
-            return 0;
-        }
+        if (idx <= hval2)
+            idx = htab->size + idx - hval2;
+        else
+            idx -= hval2;
+
+        /* If we visited all entries leave the loop */
+    } while (idx != hval);
+    __set_errno(ESRCH);
+    *retval = NULL;
+    return 0;
+}
+
+
+/*
+ * hsearch()
+ */
+
+int hsearch_r(ENTRY item, ACTION action, ENTRY ** retval,
+          struct hsearch_data *htab)
+{
+    if (action == FIND)
+        return hfind_r(item, retval, htab);
+    else
+        return henter_r(item, retval, htab);
+}
+
+/*
+ * This is the search function. It uses double hashing with open 
addressing.
+ * The argument item.key has to be a pointer to an zero terminated, most
+ * probably strings of chars. The function for generating a number of the
+ * strings is simple but fast. It can be replaced by a more complex 
function
+ * like ajw (see [Aho,Sethi,Ullman]) if the needs are shown.
+ *
+ * We use an trick to speed up the lookup. The table is created by hcreate
+ * with one more element available. This enables us to use the index zero
+ * special. This index will never be used because we store the first hash
+ * index in the field used where zero means not used. Every other 
non-negative
+ * value means used and a negative value means a deleted entry. The 
used field
+ * can be used as a first fast comparison for equality of the stored 
and the
+ * parameter value. This helps to prevent unnecessary expensive calls 
of strcmp.
+ *
+ * This implementation differs from the standard library version of
+ * this function in a number of ways:
+ *
+ * - While the standard version does not make any assumptions about
+ *   the type of the stored data objects at all, this implementation
+ *   works with NUL terminated strings only.
+ * - Instead of storing just pointers to the original objects, we
+ *   create local copies so the caller does not need to care about the
+ *   data any more.
+ * - The standard implementation does not provide a way to update an
+ *   existing entry.  This version will create a new entry or update an
+ *   existing one when both "action == ENTER" and "item.data != NULL".
+ * - Instead of returning 1 on success, we return the index into the
+ *   internal hash table, which is also guaranteed to be positive.
+ *   This allows us direct access to the found hash table slot.
+ */

-        ++htab->filled;
+int hmatch_r(const char *match, int last_idx, ENTRY ** retval,
+         struct hsearch_data *htab)
+{
+    unsigned int idx;
+    size_t key_len = strlen(match);

-        /* return new entry */
-        *retval = &htab->table[idx].entry;
-        return 1;
+    for (idx = last_idx + 1; idx < htab->size; ++idx) {
+        if (htab->table[idx].used <= 0)
+            continue;
+        if (!strncmp(match, htab->table[idx].entry.key, key_len)) {
+            *retval = &htab->table[idx].entry;
+            return idx;
+        }
      }

      __set_errno(ESRCH);
@@ -357,7 +415,6 @@ int hsearch_r(ENTRY item, ACTION action, ENTRY ** 
retval,
      return 0;
  }

-
  /*
   * hdelete()
   */
@@ -365,33 +422,56 @@ int hsearch_r(ENTRY item, ACTION action, ENTRY ** 
retval,
  /*
   * The standard implementation of hsearch(3) does not provide any way
   * to delete any entries from the hash table.  We extend the code to
- * do that.
+ * do that.  This is done by storing a negative value in used.
   */

  int hdelete_r(const char *key, struct hsearch_data *htab)
  {
-    ENTRY e, *ep;
-    int idx;
+    ENTRY e;
+    unsigned int hval;
+    unsigned int idx;

      debug("hdelete: DELETE key \"%s\"\n", key);

      e.key = (char *)key;

-    if ((idx = hsearch_r(e, FIND, &ep, htab)) == 0) {
-        __set_errno(ESRCH);
-        return 0;    /* not found */
-    }
+    idx = hval = hash_primary(&e, htab);
+    do {
+        unsigned int hval2;

-    /* free used ENTRY */
-    debug("hdelete: DELETING key \"%s\"\n", key);
+        if (htab->table[idx].used == hval
+ && strcmp(e.key, htab->table[idx].entry.key) == 0) {
+            /* Delete key/data pair */
+            free(htab->table[idx].entry.key);
+            free(htab->table[idx].entry.data);
+            /* Mark entry as deleted */
+            htab->table[idx].used = -1;
+            --htab->filled;
+            return 1;
+        }

-    free(ep->key);
-    free(ep->data);
-    htab->table[idx].used = 0;
+        if (!htab->table[idx].used)  {
+            /* end of chain, return empty */
+            __set_errno(ESRCH);
+            return 0;
+        }

-    --htab->filled;
+        hval2 = hash_secondary(hval, htab);

-    return 1;
+        /*
+         * Because SIZE is prime this guarantees to
+         * step through all available indices.
+         */
+        if (idx <= hval2)
+            idx = htab->size + idx - hval2;
+        else
+            idx -= hval2;
+
+        /* If we visited all entries leave the loop */
+    } while (idx != hval);
+
+    __set_errno(ESRCH);
+    return 0;
  }

  /*
@@ -467,7 +547,7 @@ ssize_t hexport_r(struct hsearch_data *htab, const 
char sep,
       */
      for (i = 1, n = 0, totlen = 0; i <= htab->size; ++i) {

-        if (htab->table[i].used) {
+        if (htab->table[i].used > 0) {
              ENTRY *ep = &htab->table[i].entry;

              list[n++] = ep;
@@ -703,7 +783,7 @@ int himport_r(struct hsearch_data *htab,
          e.key = name;
          e.data = value;

-        hsearch_r(e, ENTER, &rv, htab);
+        henter_r(e, &rv, htab);
          if (rv == NULL) {
              printf("himport_r: can't insert \"%s=%s\" into hash table\n",
                  name, value);
-- 
1.7.0.4

-- 
Peter Barada
peter.barada at logicpd.com