Asp Forum - PATCH to make internal Hash class retain order...

Thies C. Arntzen

8/12/2006 4:51:00 PM

Hi there -

beeing new to ruby (thru rails) and coming from a strong php background

i really miss beeing able to retain order in a Hash. i know that this
has been discussed in the past i also understand that i can always
write my own OrderedHash implemenation in user-space. -but- as a) i
love to poke around b) userspace was too slow for some of my problems
-and- c) it's always best to to let "code speak" i decided to "scratch
my itch" and wrote a patch (against 1.8.4) that will:

* remember the order in which elements have been added to a Hash (at
the cost of 2 extra pointers in every hash-bucket + 2 extra pointers
per hash-table)
* use this order for iterating (st_foreach will use this order)
* implement a call Hash.order! allows the user to change the internal
order;-)

y = { 'd' => '4', 'a' => '1', 'b' => '2', 'c' => '3' }
puts y.inspect

y.order!
puts y.inspect

y.order! {|a,b| b[1] <=> a[1]}
puts y.inspect

will now get you:
{"d"=>"4", "a"=>"1", "b"=>"2", "c"=>"3"}
{"a"=>"1", "b"=>"2", "c"=>"3", "d"=>"4"}
{"d"=>"4", "c"=>"3", "b"=>"2", "a"=>"1"}

please bear with me as i'm not fully used to all the conventions you
use in the code - also this has only seen minimal testing...

suggestions welcome!

regards, tc

--- snipp --

diff -ru ruby-1.8.4/hash.c ruby-thies/hash.c
--- ruby-1.8.4/hash.c 2005-07-19 10:25:37.000000000 +0200
+++ ruby-thies/hash.c 2006-08-12 17:28:56.000000000 +0200
@@ -1195,6 +1195,46 @@
return entries;
}

+/*
+ * call-seq:
+ * hsh.order! => hash
+ * hsh.order! {|a,b| block } => hash
+ *
+ * Orders <i>hsh</i> according to code <code>[</code> <i>key,
+ * value</i> <code>]</code>. If no block is given the sort order
+ * defaults to the keyorder.
+ *
+ * h = { "b" => 30, "c" => 10, "a" => 20 }
+ * h.order! #=> { "c" => 10, "a" => 20, "b"
=> 30 }
+ * h.order! {|a,b| a[0]<=>b[0]} #=> { "a" => 20, "b" => 30, "c"
=> 10 }
+ *
+ */
+
+static VALUE
+rb_hash_order(hash)
+ VALUE hash;
+{
+ VALUE *keyorder;
+ VALUE entries = rb_hash_to_a(hash);
+ int i;
+
+ rb_ary_sort_bang(entries);
+
+ keyorder = ALLOC_N(VALUE, RARRAY(entries)->len);
+ if (! keyorder) {
+ return 0;
+ }
+
+ for (i = 0; i < RARRAY(entries)->len; i++) {
+ keyorder[i] = RARRAY(RARRAY(entries)->ptr[i])->ptr[0];
+ }
+
+ st_reorder(RHASH(hash)->tbl, keyorder);
+ free(keyorder);
+
+ return hash;
+}
+
static int
inspect_i(key, value, str)
VALUE key, value, str;
@@ -2449,6 +2489,7 @@
rb_define_method(rb_cHash,"each_key", rb_hash_each_key, 0);
rb_define_method(rb_cHash,"each_pair", rb_hash_each_pair, 0);
rb_define_method(rb_cHash,"sort", rb_hash_sort, 0);
+ rb_define_method(rb_cHash,"order!", rb_hash_order, 0);

rb_define_method(rb_cHash,"keys", rb_hash_keys, 0);
rb_define_method(rb_cHash,"values", rb_hash_values, 0);
diff -ru ruby-1.8.4/rubytest.rb ruby-thies/rubytest.rb
--- ruby-1.8.4/rubytest.rb 2005-02-06 18:03:32.000000000 +0100
+++ ruby-thies/rubytest.rb 2006-08-12 15:04:15.000000000 +0200
@@ -42,6 +42,7 @@
print "test succeeded\n"
exit 0
end
+ puts line
error << line if line =~ %r:^(sample/test.rb|not):
end
print error
Only in ruby-thies: signal.o
Only in ruby-thies: sprintf.o
diff -ru ruby-1.8.4/st.c ruby-thies/st.c
--- ruby-1.8.4/st.c 2005-12-20 05:13:21.000000000 +0100
+++ ruby-thies/st.c 2006-08-12 18:05:55.000000000 +0200
@@ -13,11 +13,25 @@

typedef struct st_table_entry st_table_entry;

+#define ST_APPEND_TO_GLOBAL_ORDER(tbl, entry)+ (entry)->gnext = NULL;+ (entry)->gprev = (tbl)->tail;+ if ((entry)->gprev) {+ (entry)->gprev->gnext = (entry);+ }+ (tbl)->tail = (entry);+ if (! (tbl)->head) {+ (tbl)->head = (entry);+ }
+
+
struct st_table_entry {
unsigned int hash;
st_data_t key;
st_data_t record;
st_table_entry *next;
+ st_table_entry *gprev;
+ st_table_entry *gnext;
};

#define ST_DEFAULT_MAX_DENSITY 5
@@ -135,6 +149,52 @@
}
#endif

+static int
+st_unlink_entry(table, entry)
+ st_table *table;
+ st_table_entry *entry;
+{
+ int i;
+ st_table_entry *last, *ptr;
+
+ for (i = 0; i < table->num_bins; i++) {
+ last = NULL;
+ ptr = table->bins[i];
+ while (ptr) {
+ if (ptr == entry) break; /* found */
+ last = ptr;
+ ptr = ptr->next;
+ }
+ if (! ptr) continue; /* correct bin it yet to be found */
+
+ if (last) {
+ last->next = ptr->next;
+ } else {
+ table->bins[i] = ptr->next;
+ }
+
+ if (table->head == ptr) {
+ table->head = ptr->gnext;
+ }
+ if (table->tail == ptr) {
+ table->tail = ptr->gprev;
+ }
+
+ if (ptr->gnext) {
+ ptr->gnext->gprev = ptr->gprev;
+ }
+
+ if (ptr->gprev) {
+ ptr->gprev->gnext = ptr->gnext;
+ }
+
+ table->num_entries--;
+ return 1;
+ }
+ return 0;
+}
+
+
st_table*
st_init_table_with_size(type, size)
struct st_hash_type *type;
@@ -157,6 +217,8 @@
tbl->num_bins = size;
tbl->bins = (st_table_entry **)Calloc(size,
sizeof(st_table_entry*));

+ tbl->head = tbl->tail = NULL;
+
return tbl;
}

@@ -269,6 +331,7 @@
entry->record = value; entry->next = table->bins[bin_pos]; table->bins[bin_pos] = entry;+ ST_APPEND_TO_GLOBAL_ORDER(table, entry) table->num_entries++; } while (0)

@@ -339,39 +402,74 @@
{
st_table *new_table;
st_table_entry *ptr, *entry;
- int i, num_bins = old_table->num_bins;
+ int num_bins = old_table->num_bins;
+ unsigned int bin_pos;

- new_table = alloc(st_table);
+ new_table = st_init_table_with_size(old_table->type,
old_table->num_bins);
if (new_table == 0) {
return 0;
}

- *new_table = *old_table;
- new_table->bins = (st_table_entry**)
- Calloc((unsigned)num_bins, sizeof(st_table_entry*));
+ ptr = old_table->head;
+ while (ptr) {
+ entry = alloc(st_table_entry);
+ if (! entry) {
+ st_free_table(new_table); /* safe as new_table is consistent */
+ return 0;
+ }
+ *entry = *ptr;
+ bin_pos = entry->hash % num_bins;
+ entry->next = new_table->bins[bin_pos];
+ new_table->bins[bin_pos] = entry;
+ ST_APPEND_TO_GLOBAL_ORDER(new_table, entry);
+ new_table->num_entries++;

- if (new_table->bins == 0) {
- free(new_table);
- return 0;
+ ptr = ptr->gnext;
}

- for(i = 0; i < num_bins; i++) {
- new_table->bins[i] = 0;
- ptr = old_table->bins[i];
- while (ptr != 0) {
- entry = alloc(st_table_entry);
- if (entry == 0) {
- free(new_table->bins);
- free(new_table);
- return 0;
- }
- *entry = *ptr;
- entry->next = new_table->bins[i];
- new_table->bins[i] = entry;
- ptr = ptr->next;
+ return new_table;
+}
+
+void
+st_reorder(table, keylist)
+ st_table *table;
+ st_data_t *keylist;
+{
+ st_table buf_table, *org_table;
+ st_table_entry *entry;
+ st_data_t key;
+ unsigned int i, hash_val, bin_pos;
+
+ /*
+ * "algorithmus":
+ * 1 remeber old table content
+ * 2 reinitialize old_table to be empty
+ * 3 refill old table in corrent order (supplied by keylist) using

the old content
+ */
+
+ buf_table = *table;
+ org_table = &buf_table;
+
+ table->num_entries = 0;
+ table->bins = (st_table_entry **)Calloc(table->num_bins,
sizeof(st_table_entry*));
+ table->head = table->tail = NULL;
+
+ for (i = 0; i < org_table->num_entries; i++) {
+ key = keylist[i];
+ hash_val = do_hash(key, org_table);
+ FIND_ENTRY(org_table, entry, hash_val, bin_pos);
+ if (! entry) {
+ /* should not happen */
+ printf("uoooo...\n");
}
+
+ entry->next = table->bins[bin_pos];
+ table->bins[bin_pos] = entry;
+ ST_APPEND_TO_GLOBAL_ORDER(table, entry);
+ table->num_entries++;
}
- return new_table;
+
+ free(org_table->bins);
}

int
@@ -381,39 +479,26 @@
st_data_t *value;
{
unsigned int hash_val;
- st_table_entry *tmp;
register st_table_entry *ptr;

hash_val = do_hash_bin(*key, table);
ptr = table->bins[hash_val];

- if (ptr == 0) {
- if (value != 0) *value = 0;
- return 0;
+ while (ptr) {
+ if (EQUAL(table, ptr->key, *key)) break;
+ ptr = ptr->next;
}

- if (EQUAL(table, *key, ptr->key)) {
- table->bins[hash_val] = ptr->next;
- table->num_entries--;
+ if (ptr) {
+ st_unlink_entry(table, ptr);
if (value != 0) *value = ptr->record;
*key = ptr->key;
free(ptr);
return 1;
+ } else {
+ if (value != 0) *value = 0;
+ return 0;
}
-
- for(; ptr->next != 0; ptr = ptr->next) {
- if (EQUAL(table, ptr->next->key, *key)) {
- tmp = ptr->next;
- ptr->next = ptr->next->next;
- table->num_entries--;
- if (value != 0) *value = tmp->record;
- *key = tmp->key;
- free(tmp);
- return 1;
- }
- }
-
- return 0;
}

int
@@ -429,22 +514,21 @@
hash_val = do_hash_bin(*key, table);
ptr = table->bins[hash_val];

- if (ptr == 0) {
- if (value != 0) *value = 0;
- return 0;
+ while (ptr) {
+ if ((ptr->key != never) && EQUAL(table, ptr->key, *key)) break;
+ ptr = ptr->next;
}

- for(; ptr != 0; ptr = ptr->next) {
- if ((ptr->key != never) && EQUAL(table, ptr->key, *key)) {
- table->num_entries--;
- *key = ptr->key;
- if (value != 0) *value = ptr->record;
- ptr->key = ptr->record = never;
- return 1;
- }
+ if (ptr) {
+ table->num_entries--;
+ if (value != 0) *value = ptr->record;
+ *key = ptr->key;
+ free(ptr);
+ return 1;
+ } else {
+ if (value != 0) *value = 0;
+ return 0;
}
-
- return 0;
}

static int
@@ -472,46 +556,40 @@
int (*func)();
st_data_t arg;
{
- st_table_entry *ptr, *last, *tmp;
+ st_table_entry *ptr, *tmp;
enum st_retval retval;
- int i;
-
- for(i = 0; i < table->num_bins; i++) {
- last = 0;
- for(ptr = table->bins[i]; ptr != 0;) {
- retval = (*func)(ptr->key, ptr->record, arg);
- switch (retval) {
- case ST_CHECK: /* check if hash is modified during iteration */
- tmp = 0;
- if (i < table->num_bins) {
- for (tmp = table->bins[i]; tmp; tmp=tmp->next) {
- if (tmp == ptr) break;
- }
- }
- if (!tmp) {
- /* call func with error notice */
- return 1;
- }
- /* fall through */
- case ST_CONTINUE:
- last = ptr;
- ptr = ptr->next;
- break;
- case ST_STOP:
- return 0;
- case ST_DELETE:
- tmp = ptr;
- if (last == 0) {
- table->bins[i] = ptr->next;
- }
- else {
- last->next = ptr->next;
- }
- ptr = ptr->next;
- free(tmp);
- table->num_entries--;
- }
- }
+
+ ptr = table->head;
+ while (ptr) {
+ retval = (*func)(ptr->key, ptr->record, arg);
+ switch (retval) {
+ case ST_CHECK: /* check if hash is modified during iteration
*/
+ tmp = table->head;
+ while (tmp) {
+ if (tmp == ptr) break; /* current element is still
there - good */
+ tmp = tmp->gnext;
+ }
+ if (! tmp) {
+ /* call func with error notice */
+ return 1;
+ }
+ /* fall through */
+ case ST_CONTINUE:
+ ptr = ptr->gnext;
+ break;
+ case ST_STOP:
+ return 0;
+ case ST_DELETE:
+ tmp = ptr;
+ ptr = ptr->gnext;
+ st_unlink_entry(table, tmp);
+ free(tmp);
+ break;
+ default:
+ /* should never happen! */
+ printf("uhh? retval=%d\n", retval);
+ exit(-10);
+ }
}
return 0;
}
diff -ru ruby-1.8.4/st.h ruby-thies/st.h
--- ruby-1.8.4/st.h 2005-02-08 01:50:33.000000000 +0100
+++ ruby-thies/st.h 2006-08-12 17:51:50.000000000 +0200
@@ -21,6 +21,8 @@
int num_bins;
int num_entries;
struct st_table_entry **bins;
+ struct st_table_entry *head;
+ struct st_table_entry *tail;
};

#define st_is_member(table,key) st_lookup(table,key,(st_data_t *)0)
@@ -53,6 +55,7 @@
void st_free_table _((st_table *));
void st_cleanup_safe _((st_table *, st_data_t));
st_table *st_copy _((st_table *));
+void st_reorder _((st_table *, st_data_t *));

#define ST_NUMCMP ((int (*)()) 0)
#define ST_NUMHASH ((int (*)()) -2)

20 Answers

ts

8/12/2006 5:07:00 PM

>>>>> "T" == Thies C Arntzen <thieso@gmail.com> writes:

T> i really miss beeing able to retain order in a Hash. i know that this
T> has been discussed in the past i also understand that i can always
T> write my own OrderedHash implemenation in user-space. -but- as a) i
T> love to poke around b) userspace was too slow for some of my problems
T> -and- c) it's always best to to let "code speak" i decided to "scratch
T> my itch" and wrote a patch (against 1.8.4) that will:

Why you don't first write a C extension, rather than modifying ruby ?

--

Guy Decoux

Brian McCallister

8/12/2006 5:53:00 PM

On Aug 12, 2006, at 9:55 AM, Thies C. Arntzen wrote:

> Hi there -
>
> beeing new to ruby (thru rails) and coming from a strong php
> background

<snip />

> * remember the order in which elements have been added to a Hash (at
> the cost of 2 extra pointers in every hash-bucket + 2 extra pointers
> per hash-table)
> * use this order for iterating (st_foreach will use this order)
> * implement a call Hash.order! allows the user to change the internal
> order;-)

Nice :-) The better place for discussion hacking on the ruby
interpreter itself is probably ruby-core.

How difficult would it be to do an OrderedHash data structure as a C
extension instead of in Ruby?

-Brian

Thies C. Arntzen

8/12/2006 6:32:00 PM

Brian McCallister schrieb:

> On Aug 12, 2006, at 9:55 AM, Thies C. Arntzen wrote:
>
> > Hi there -
> >
> > beeing new to ruby (thru rails) and coming from a strong php
> > background
>
> <snip />
>
> > * remember the order in which elements have been added to a Hash (at
> > the cost of 2 extra pointers in every hash-bucket + 2 extra pointers
> > per hash-table)
> > * use this order for iterating (st_foreach will use this order)
> > * implement a call Hash.order! allows the user to change the internal
> > order;-)
>
> Nice :-) The better place for discussion hacking on the ruby
> interpreter itself is probably ruby-core.

i wasn't aware of that list;-) subscribed...

>
> How difficult would it be to do an OrderedHash data structure as a C
> extension instead of in Ruby?

i really believe that it needs to be a 1st class citizen - the reason
it's done in C is because userland imposes too much overhead (i
believe) and converting forward<->back to the normal Hash would be
slow, memory intensive and ordering information would get lost on the
way...

also the amount of code added to the Hash class is ~20 lines of C-code
- this code adds zero overhead. the bigger chunk of the code was added
to the st.c which is the underlying library for handling hash-tables -
and duplicating > 600 lines of code to add ~60 lines didn't seem very
cool to me;-)

best, thies

dblack

8/12/2006 6:46:00 PM

Robert Klemme

8/12/2006 10:20:00 PM

Thies C. Arntzen wrote:
> Brian McCallister schrieb:
>
>> On Aug 12, 2006, at 9:55 AM, Thies C. Arntzen wrote:
>>
>>> Hi there -
>>>
>>> beeing new to ruby (thru rails) and coming from a strong php
>>> background
>> <snip />
>>
>>> * remember the order in which elements have been added to a Hash (at
>>> the cost of 2 extra pointers in every hash-bucket + 2 extra pointers
>>> per hash-table)
>>> * use this order for iterating (st_foreach will use this order)
>>> * implement a call Hash.order! allows the user to change the internal
>>> order;-)
>> Nice :-) The better place for discussion hacking on the ruby
>> interpreter itself is probably ruby-core.
>
> i wasn't aware of that list;-) subscribed...
>
>> How difficult would it be to do an OrderedHash data structure as a C
>> extension instead of in Ruby?
>
> i really believe that it needs to be a 1st class citizen, - the reason
> it's done in C is because userland imposes too much overhead (i
> believe) and converting forward<->back to the normal Hash would be
> slow, memory intensive and ordering information would get lost on the
> way...
>
> also the amount of code added to the Hash class is ~20 lines of C-code
> - this code adds zero overhead. the bigger chunk of the code was added
> to the st.c which is the underlying library for handling hash-tables -
> and duplicating > 600 lines of code to add ~60 lines didn't seem very
> cool to me;-)

Adding to David's comments: you're changing semantics of one of the core
classes which is in itself a dangerous thing. Plus, you increase the
memory footprint of a hash which is IMHO a very bad idea for such a
widely used data structure (and some of these hashes grow huge). So, if
you want to patch ruby core, make it at least a subclass of Hash so
people can choose.

Then of course there's a whole bunch of other things that one should
consider. Is the name properly chosen? Do we have to provide a means
for the user to *define* the standard order on creation? This is e.g.
the way Java's TreeSet works - and IMHO it's superior vs. having to call
order! manually. etc.

Kind regards

robert

Trans

8/12/2006 11:01:00 PM

Robert Klemme wrote:

> Adding to David's comments: you're changing semantics of one of the core
> classes which is in itself a dangerous thing. Plus, you increase the
> memory footprint of a hash which is IMHO a very bad idea for such a
> widely used data structure (and some of these hashes grow huge). So, if
> you want to patch ruby core, make it at least a subclass of Hash so
> people can choose.
>
> Then of course there's a whole bunch of other things that one should
> consider. Is the name properly chosen? Do we have to provide a means
> for the user to *define* the standard order on creation? This is e.g.
> the way Java's TreeSet works - and IMHO it's superior vs. having to call
> order! manually. etc.

All good points. I still think it would be nice if such a class were
built-in to Ruby with literal:

[ :a => 1, :b => 2 ]

Maybe as a red-black tree class.

T.

Hal E. Fulton

8/12/2006 11:44:00 PM

ts wrote:
>>>>>>"T" == Thies C Arntzen <thieso@gmail.com> writes:
>
>
> T> i really miss beeing able to retain order in a Hash. i know that this
> T> has been discussed in the past i also understand that i can always
> T> write my own OrderedHash implemenation in user-space. -but- as a) i
> T> love to poke around b) userspace was too slow for some of my problems
> T> -and- c) it's always best to to let "code speak" i decided to "scratch
> T> my itch" and wrote a patch (against 1.8.4) that will:
>
> Why you don't first write a C extension, rather than modifying ruby ?
>

Interesting question... do you mean a C extension implementing a
different Hash class or what?

Hal

Hal E. Fulton

8/12/2006 11:46:00 PM

Brian McCallister wrote:
>
> On Aug 12, 2006, at 9:55 AM, Thies C. Arntzen wrote:
>
>> Hi there -
>>
>> beeing new to ruby (thru rails) and coming from a strong php background
>
>
> <snip />
>
>> * remember the order in which elements have been added to a Hash (at
>> the cost of 2 extra pointers in every hash-bucket + 2 extra pointers
>> per hash-table)
>> * use this order for iterating (st_foreach will use this order)
>> * implement a call Hash.order! allows the user to change the internal
>> order;-)
>
>
> Nice :-) The better place for discussion hacking on the ruby
> interpreter itself is probably ruby-core.
>
> How difficult would it be to do an OrderedHash data structure as a C
> extension instead of in Ruby?

Well, if it were another class, then you would lose the order
information before you ever got into that class, right?

hash = {a=>1, b=>2, c=>3}

ordered = OrderedHash.new(hash) # Oops -- too late

Hal

Hal E. Fulton

8/12/2006 11:48:00 PM

dblack@wobblini.net wrote:
>
> The problem you likely face, though, is that no one will use it (which
> may or may not matter to you :-) You're requiring that people use a
> patched Ruby, and if anyone writes an application that uses it, then
> they require that everyone who runs the application use a patched
> Ruby, and so on.
>
> It's much more likely -- probably literally thousands of times more
> likely -- that people will try out something distributed as an
> extension than that they'll patch the Ruby source. And C extensions
> are very fast, once loaded.

My impression was that this was for play and for testing
prior to possibly being folded into Ruby.

Also, I don't yet see how this could be implemented as a
C extension. See my previous email.

Hal

dblack

8/12/2006 11:52:00 PM

comp.lang.ruby

PATCH to make internal Hash class retain order...

Thies C. Arntzen

ts

Brian McCallister

Thies C. Arntzen

dblack

Robert Klemme

Trans

Hal E. Fulton

Hal E. Fulton

Hal E. Fulton

dblack

x Login to ForumsZone