I'm trying to subclass an np.ndarray
with the following constraints:
- the subclass has to be memory mapped
- is has to be pickleable while keeping the underlying buffer memory mapped.
I was able to implement the subclass, however, when pickling the object, the buffer is copied into the pickled object, so loading the pickled object means loading the underlying memmap to memory.
Here's the snippet:
import numpy as np
from pathlib import Path
import pickle
import mmap
class PersistedArray(np.ndarray):
suffix: str = 'mmap'
__array_priority__ = 1.0
def __new__(cls, input_array, name: str, folder: Path):
folder.mkdir(parents=True, exist_ok=True)
path = folder / f'{name}.{cls.suffix}'
mode = 'r+' if path.is_file() else 'w+'
obj = np.memmap(path, dtype=input_array.dtype, shape=input_array.shape, mode=mode).view(cls)
obj[:] = input_array[:]
obj.folder = folder
obj.name = name
obj.path = path
obj.meta_path = obj.path.with_suffix('.p')
meta_json = {'dtype': obj.dtype, 'shape': obj.shape}
with open(obj.meta_path, 'wb') as f:
pickle.dump(meta_json, f)
return obj
def __array_finalize__(self, obj):
if obj is None: return
self.name = getattr(obj, 'name', None)
self.path = getattr(obj, 'path', None)
self.folder = getattr(obj, 'folder', None)
self.meta_path = getattr(obj, 'meta_path', None)
def __reduce__(self):
object_state = list(super().__reduce__())
object_state[2] = (tuple(object_state[2]), self.__dict__)
return tuple(object_state)
def __setstate__(self, state):
nd_state, own_state = state
self.__dict__.update(own_state)
super().__setstate__(nd_state)
The problem lies in super().__reduce__()
- it expects a byte string buffer in the last item, and I couldn't get around that. I tried updating the buffer to a single byte length, and the shape to 1, loading a memory map in __setstate__
and replacing the original buffer back with the memory mapped one, but np.ndarray
expects a byte string.