Group Similar Dict Entries As A Tuple Of Keys
I would like to group similar entries of a dataset. ds = {1: 'foo', 2: 'bar', 3: 'foo', 4: 'bar', 5: 'foo'} >>> tupelize_dict(ds) {(1, 3, 5): 'foo', (2, 4): 'bar'}
Solution 1:
something like this should do the trick:
from collections import defaultdict

ds = {
    1: 'foo',
    2: 'bar',
    3: 'foo',
    4: 'bar',
    5: 'foo',
}

# Invert the mapping: collect every key under the value it maps to.
d = defaultdict(list)
for k, v in ds.items():
    d[v].append(k)

# Flip it back, using the tuple of grouped keys as the new key.
res = {tuple(keys): value for value, keys in d.items()}
print(res)  # {(1, 3, 5): 'foo', (2, 4): 'bar'}
Solution 2:
Alternatively, you could do something like this:
def tupelize_dict(ds):
    """Group the keys of ``ds`` that map to the same value.

    Returns a new dict whose keys are tuples of the original keys (in the
    iteration order of ``ds``) and whose values are the value those keys
    share. The values of ``ds`` are used as dict keys internally, so they
    must be hashable.
    """
    cache = {}
    for key, value in ds.items():
        # setdefault creates the key list the first time a value is seen.
        cache.setdefault(value, []).append(key)
    return {tuple(keys): value for value, keys in cache.items()}
# Sample data: keys 1, 3 and 5 share 'foo'; keys 2 and 4 share 'bar'.
ds = {1: 'foo', 2: 'bar', 3: 'foo', 4: 'bar', 5: 'foo'}
print(tupelize_dict(ds))
Solution 3:
Following acushner's answer, it is possible to make it work if I can compute a hash of the content of each element of the dataset.
import pickle
from collections import defaultdict
def tupelize_dict(ds):
    """Group the keys of ``ds`` whose values pickle to the same bytes.

    Each value is serialized with ``pickle.dumps`` and the resulting bytes
    are used as its signature, so the values themselves do not need to be
    hashable (they only need to be picklable).

    Returns a new dict mapping a tuple of keys (in the iteration order of
    ``ds``) to one of the values of that group (the last one seen).

    Note: the pickle byte stream is not a canonical form. Values that
    compare equal can serialize differently (for example dicts built in a
    different insertion order) and then end up in separate groups.
    """
    representative = {}  # signature -> a value carrying that signature
    grouped = defaultdict(list)  # signature -> keys whose value has it
    for key, value in ds.items():
        # Sign the individual value, not the whole dataset: signing ``ds``
        # would give every entry the same signature and a single group.
        signature = pickle.dumps(value)
        representative[signature] = value
        grouped[signature].append(key)
    return {tuple(keys): representative[sig] for sig, keys in grouped.items()}
This solution is MUCH faster than my original proposition.
To test it, I generated a set of large random nested dictionaries and ran cProfile
on both implementations:
original: 204.9 seconds
new: 6.4 seconds
EDIT:
I realized that dumps does not work with some dictionaries, because the order of the keys can vary internally for obscure reasons (see this question).
A workaround would be to order all the dicts:
import collections
import collections.abc
import copy
def faithfulrepr(od):
    """Return a string signature of ``od`` that ignores dict key order.

    - A mapping is rendered as the ``repr`` of an ``OrderedDict`` whose
      items are sorted by key and whose values are the signatures of the
      original values.
    - A list is rendered as the ``repr`` of the list of its elements'
      signatures.
    - Anything else is rendered with plain ``repr``.

    Children are rendered recursively, so nested values appear as quoted
    strings inside the result. The output is meant to be compared for
    equality, not to be evaluated or displayed.

    The input is never modified. Sorting requires the keys of each mapping
    to be mutually orderable; ``sorted`` raises ``TypeError`` otherwise.
    """
    if isinstance(od, collections.abc.Mapping):
        res = collections.OrderedDict()
        # Sort on the keys alone; the values may not be orderable.
        for k in sorted(od):
            res[k] = faithfulrepr(od[k])
        return repr(res)
    if isinstance(od, list):
        # Build a fresh list instead of writing into ``od``, so the
        # caller's data stays untouched and no deep copy is needed.
        return repr([faithfulrepr(v) for v in od])
    return repr(od)
def tupelize_dict(ds):
    """Group the keys of ``ds`` whose values share a ``faithfulrepr``.

    Each value is reduced to a signature string with ``faithfulrepr``;
    keys whose values produce the same signature form one group.

    Returns a new dict mapping each group to one of its values (the last
    one seen). A group of several keys becomes a sorted tuple of those
    keys; a group of a single key is kept as the bare key, not wrapped in
    a 1-tuple.
    """
    taxonomy = {}  # signature -> a value carrying that signature
    binder = collections.defaultdict(list)  # signature -> keys
    for key, value in ds.items():
        signature = faithfulrepr(value)
        taxonomy[signature] = value
        binder[signature].append(key)

    def tu(keys):
        # Singleton groups keep their original key unchanged.
        return tuple(sorted(keys)) if len(keys) > 1 else keys[0]

    return {tu(keys): taxonomy[s] for s, keys in binder.items()}
Post a Comment for "Group Similar Dict Entries As A Tuple Of Keys"