Source code for hypernets.tabular.data_hasher

# -*- coding:utf-8 -*-
"""

"""
import hashlib
import pickle
from io import BytesIO

import numpy as np
import pandas as pd
from pandas.util import hash_pandas_object


class DataHasher:
    """Compute a hex digest describing the content of a piece of data.

    The object is callable: ``DataHasher()(data)`` walks ``data``, turns it
    into a stream of byte chunks (see :meth:`_iter_data`) and feeds them to a
    ``hashlib`` hasher.

    Parameters
    ----------
    method : str, default 'md5'
        Name of the ``hashlib`` constructor to use (looked up with
        ``getattr(hashlib, method)``).
    """

    def __init__(self, method: str = 'md5'):
        self.method = method

    def __call__(self, data) -> str:
        """Return the hexadecimal digest of ``data``.

        Raises
        ------
        AttributeError
            If ``hashlib`` has no attribute named ``self.method``.
        """
        digest = getattr(hashlib, self.method)()
        for chunk in self._iter_data(data):
            digest.update(chunk)
        return digest.hexdigest()

    def _iter_data(self, data):
        """Yield the byte chunks that represent ``data``.

        The first chunk is always the qualified type name of ``data``, so the
        same payload held in different types (e.g. ``list`` vs ``tuple``)
        produces different digests.  The remaining chunks depend on the type:

        * ``None``            -> the marker ``b'<None>'``
        * ``pd.DataFrame``    -> Series-valued cells are expanded recursively,
          the other columns are hashed row-wise (the index is ignored)
        * ``pd.Series``       -> hashed as a one-column frame
        * ``np.ndarray``      -> row hashes of the array
        * ``bytes``/``bytearray``/``str`` -> the raw / UTF-8 encoded bytes
        * ``list``/``tuple``  -> every item, recursively, in order
        * ``dict``            -> ``key``, ``b'='``, ``value`` for every item,
          in the dict's iteration order
        * anything else       -> its pickle (``pickle.HIGHEST_PROTOCOL``)
        """
        yield self._qname(type(data)).encode('utf-8')

        if data is None:
            yield b'<None>'
        elif isinstance(data, pd.DataFrame):
            # Fix: TypeError: unhashable type: 'Series' in case of pd.Series in pd.Series
            hashable = []
            for column in data.columns:
                data_series = data[column]
                # Peek at the first cell only if there is one: a frame with
                # columns but no rows used to raise IndexError here.
                head = data_series[:1].tolist()
                if head and isinstance(head[0], pd.Series):
                    # Column of nested Series: hash each nested Series on its
                    # own; cells that are not Series are skipped.
                    for item in data_series:
                        if isinstance(item, pd.Series):
                            yield from self._iter_data(item)
                else:
                    hashable.append(column)
            if len(hashable) > 0:
                yield from self._iter_pd_dataframe(data[hashable])
        elif isinstance(data, pd.Series):
            yield from self._iter_pd_dataframe(data.to_frame())
        elif isinstance(data, np.ndarray):
            yield from self._iter_ndarray(data)
        elif isinstance(data, (bytes, bytearray)):
            yield data
        elif isinstance(data, str):
            yield data.encode('utf-8')
        elif isinstance(data, (list, tuple)):
            for x in data:
                yield from self._iter_data(x)
        elif isinstance(data, dict):
            for k, v in data.items():
                yield from self._iter_data(k)
                yield b'='
                yield from self._iter_data(v)
        else:
            # Fallback for arbitrary objects.  The buffer is closed by the
            # ``with`` block even when pickling fails.
            with BytesIO() as buf:
                pickle.dump(data, buf, protocol=pickle.HIGHEST_PROTOCOL)
                payload = buf.getvalue()
            yield payload

    @staticmethod
    def _qname(cls):
        """Return ``'<module>.<name>'`` for the given class object."""
        return f'{cls.__module__}.{cls.__name__}'

    @staticmethod
    def _hash_pd_dataframe(df):
        """Return the row hashes of ``df`` computed without the index."""
        return hash_pandas_object(df, index=False)

    @staticmethod
    def _hash_ndarray(arr):
        """Return the row hashes of ``arr`` as a column vector (shape ``(-1, 1)``).

        An array whose first dimension is empty yields an empty ``'u8'``
        column.  Otherwise the array is wrapped in ``pd.DataFrame`` before
        hashing -- NOTE(review): presumably only 1-D / 2-D input is expected.
        """
        if arr.shape[0] == 0:
            v = np.array([], dtype='u8').reshape((-1, 1))
        else:
            v = hash_pandas_object(pd.DataFrame(arr), index=False).values.reshape((-1, 1))
        return v

    @classmethod
    def _iter_pd_dataframe(cls, df):
        """Yield the column names (one comma-joined chunk), then the row hashes."""
        yield ','.join(map(str, df.columns.tolist())).encode('utf-8')
        yield cls._hash_pd_dataframe(df).values

    @classmethod
    def _iter_ndarray(cls, arr):
        """Yield the single chunk holding the row hashes of ``arr``."""
        yield cls._hash_ndarray(arr)