Page MenuHomec4science

intbitset.pyx
No OneTemporary

File Metadata

Created
Tue, May 21, 18:14

intbitset.pyx

# $Id$
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
Defines an intbitset data object to hold unordered sets of unsigned
integers with ultra fast set operations, implemented via bit vectors
and Python C extension to optimize speed and memory usage.
Emulates the Python built-in set class interface with some additional
specific methods such as its own fast dump and load marshalling
functions. Uses real bits to optimize memory usage, so may have
issues with endianness if you transport serialized bitsets between
various machine architectures.
"""
import zlib
from array import array
ctypedef unsigned long long int word_t
ctypedef unsigned int size_t
ctypedef unsigned short int bool_t
ctypedef unsigned long int Py_ssize_t
cdef extern from "Python.h":
object PyBuffer_FromMemory(void *ptr, int size)
object PyString_FromStringAndSize(char *s, int len)
int PyObject_AsReadBuffer(object obj, void **buf, size_t *buf_len)
object PySequence_Fast(object o, char *m)
object PySequence_Fast_GET_ITEM(object o, Py_ssize_t i)
Py_ssize_t PySequence_Fast_GET_SIZE(object o)
cdef extern from "intbitset.h":
ctypedef struct IntBitSet:
size_t size
int tot
word_t *bitset
size_t wordbytesize
size_t wordbitsize
IntBitSet *intBitSetCreate(size_t size)
IntBitSet *intBitSetCreateFull(size_t size)
IntBitSet *intBitSetCreateFromBuffer(void *buf, size_t bufsize)
IntBitSet *intBitSetResetFromBuffer(IntBitSet *bitset, void *buf, size_t bufsize)
IntBitSet *intBitSetReset(IntBitSet *bitset)
void intBitSetDestroy(IntBitSet *bitset)
IntBitSet *intBitSetClone(IntBitSet *bitset)
size_t intBitSetGetSize(IntBitSet *bitset)
size_t intBitSetGetTot(IntBitSet * bitset)
bool_t intBitSetIsInElem(IntBitSet *bitset, size_t elem)
void intBitSetAddElem(IntBitSet *bitset, size_t elem)
void intBitSetDelElem(IntBitSet *bitset, size_t elem)
bool_t intBitSetEmpty(IntBitSet *bitset)
IntBitSet *intBitSetUnion(IntBitSet *x, IntBitSet *y)
IntBitSet *intBitSetIntersection(IntBitSet *x, IntBitSet *y)
IntBitSet *intBitSetSub(IntBitSet *x, IntBitSet *y)
IntBitSet *intBitSetXor(IntBitSet *x, IntBitSet *y)
IntBitSet *intBitSetIUnion(IntBitSet *dst, IntBitSet *src)
IntBitSet *intBitSetIIntersection(IntBitSet *dst, IntBitSet *src)
IntBitSet *intBitSetISub(IntBitSet *x, IntBitSet *y)
IntBitSet *intBitSetIXor(IntBitSet *x, IntBitSet *y)
int intBitSetGetNext(IntBitSet *x, int last)
unsigned char intBitSetCmp(IntBitSet *x, IntBitSet *y)
cdef class intbitset:
"""
Defines an intbitset data object to hold unordered sets of
unsigned integers with ultra fast set operations, implemented via
bit vectors and Python C extension to optimize speed and memory
usage.
Emulates the Python built-in set class interface with some
additional specific methods such as its own fast dump and load
marshalling functions. Uses real bits to optimize memory usage,
so may have issues with endianness if you transport serialized
bitsets between various machine architectures.
"""
cdef IntBitSet *bitset
#def __init__(intbitset self, rhs=0, int minsize=-1):
#if not self.bitset:
#raise ValueError, "Impossible to create intbitset"
def __new__(intbitset self, rhs=0, int minsize=-1):
"""Initialize intbitset. rhs can be:
int/long for creating allocating empty intbitset that will hold at least
rhs elements, before being resized
intbitset for cloning
str for retrieving an intbitset that was dumped into a string
array for retrieving an intbitset that was dumpeg into a string stored
in an array
a sequence made of integers for copying all the elements from the
sequence. If minsize is specified than it is initially allocated
enough space to hold up to minsize integers, otherwise the biggest
element of the sequence will be used.
"""
cdef size_t size
cdef void *buf
cdef size_t elem
cdef char *msg
cdef size_t i
msg = "Error"
self.bitset = NULL
if type(rhs) in (int, long):
if rhs < 0:
raise ValueError, "rhs can't be negative"
self.bitset = intBitSetCreate(rhs)
elif type(rhs) is intbitset:
self.bitset = intBitSetClone((<intbitset>rhs).bitset)
elif type(rhs) is str:
try:
tmp = zlib.decompress(rhs)
PyObject_AsReadBuffer(tmp, &buf, &size)
self.bitset = intBitSetCreateFromBuffer(buf, size)
except:
raise ValueError, "rhs is corrupted"
elif type(rhs) is array:
try:
tmp = zlib.decompress(rhs.tostring())
PyObject_AsReadBuffer(tmp, &buf, &size)
self.bitset = intBitSetCreateFromBuffer(buf, size)
except:
raise ValueError, "rhs is corrupted"
elif hasattr(rhs, '__iter__'):
try:
if minsize > -1:
self.bitset = intBitSetCreate(minsize)
else:
if rhs:
self.bitset = intBitSetCreate(int(max(rhs)))
else:
self.bitset = intBitSetCreate(0)
for elem in rhs:
intBitSetAddElem(self.bitset, elem)
except Exception, msg:
raise ValueError, "retrieving integers from rhs is impossible :%s" \
% msg
else:
raise TypeError, "rhs is of unknown type %s" % type(rhs)
def __dealloc__(intbitset self):
if self.bitset:
intBitSetDestroy(self.bitset)
def __contains__(intbitset self, unsigned int elem):
return intBitSetIsInElem(self.bitset, elem) != 0
def __cmp__(intbitset self, intbitset rhs):
raise TypeError, "cannot compare intbitset using cmp()"
def __richcmp__(intbitset self, intbitset rhs, int op):
cdef short unsigned int tmp
tmp = intBitSetCmp(self.bitset, rhs.bitset)
if op == 0: # <
return tmp == 1
if op == 1: # <=
return tmp <= 1
if op == 2: # ==
return tmp == 0
if op == 3: # !=
return tmp > 0
if op == 4: # >
return tmp == 2
if op == 5: # >=
return tmp in (0, 2)
def __len__(intbitset self):
return intBitSetGetTot(self.bitset)
def __hash__(intbitset self):
return hash(PyString_FromStringAndSize(<char *>self.bitset.bitset, wordbytesize * (intBitSetGetTot(self.bitset) / wordbitsize + 1)))
def __nonzero__(intbitset self):
return not intBitSetEmpty(self.bitset)
def __iadd__(intbitset self, rhs):
cdef size_t elem
if isinstance(rhs, (int, long)):
intBitSetAddElem(self.bitset, rhs)
elif isinstance(rhs, intbitset):
intBitSetIUnion(self.bitset, (<intbitset> rhs).bitset)
else:
for elem in rhs:
intBitSetAddElem(self.bitset, elem)
return self
def __isub__(intbitset self, rhs):
cdef size_t elem
if isinstance(rhs, (int, long)):
intBitSetDelElem(self.bitset, rhs)
elif isinstance(rhs, intbitset):
intBitSetISub(self.bitset, (<intbitset> rhs).bitset)
else:
for elem in rhs:
intBitSetDelElem(self.bitset, elem)
return self
def __deepcopy__(intbitset self, memo):
return intbitset(self)
def __del__(intbitset self, size_t elem):
intBitSetDelElem(self.bitset, elem)
def __and__(intbitset self, intbitset rhs):
ret = intbitset()
intBitSetDestroy((<intbitset>ret).bitset)
(<intbitset>ret).bitset = intBitSetIntersection(self.bitset, rhs.bitset)
return ret
def __or__(intbitset self, intbitset rhs):
ret = intbitset()
intBitSetDestroy((<intbitset>ret).bitset)
(<intbitset>ret).bitset = intBitSetUnion(self.bitset, rhs.bitset)
return ret
def __xor__(intbitset self, intbitset rhs):
ret = intbitset()
intBitSetDestroy((<intbitset>ret).bitset)
(<intbitset>ret).bitset = intBitSetXor(self.bitset, rhs.bitset)
return ret
def __sub__(intbitset self, intbitset rhs):
ret = intbitset()
intBitSetDestroy((<intbitset>ret).bitset)
(<intbitset>ret).bitset = intBitSetSub(self.bitset, rhs.bitset)
return ret
def __iand__(intbitset self, intbitset rhs):
intBitSetIIntersection(self.bitset, rhs.bitset)
return self
def __ior__(intbitset self, intbitset rhs):
intBitSetIUnion(self.bitset, rhs.bitset)
return self
def __ixor__(intbitset self, intbitset rhs):
intBitSetIXor(self.bitset, rhs.bitset)
return self
def __repr__(intbitset self):
ret = "intbitset(["
last = -1
while last >= -1:
last = intBitSetGetNext(self.bitset, last)
ret = ret + '%i, ' % last
ret = ret[:-len('-2, ')]
if ret.endswith(', '):
ret = ret[:-2]
ret = ret + '])'
return ret
def __iter__(intbitset self):
return intbitset_iterator(self)
def add(intbitset self, size_t elem):
"""Add an element to a set.
This has no effect if the element is already present."""
intBitSetAddElem(self.bitset, elem)
def clear(intbitset self):
intBitSetReset(self.bitset)
def difference(intbitset self, intbitset rhs):
"""Return the difference of two intbitsets as a new set.
(i.e. all elements that are in this intbitset but not the other.)
"""
return self.__sub__(rhs)
def difference_update(intbitset self, intbitset rhs):
"""Remove all elements of another set from this set."""
self.__isub__(rhs)
def discard(intbitset self, size_t elem):
"""Remove an element from a intbitset if it is a member.
If the element is not a member, do nothing."""
intBitSetDelElem(self.bitset, elem)
def intersection(intbitset self, intbitset rhs):
"""Return the intersection of two intbitsets as a new set.
(i.e. all elements that are in both intbitsets.)
"""
return self.__and__(rhs)
def intersection_update(intbitset self, intbitset rhs):
"""Update a intbitset with the intersection of itself and another."""
self.__iand__(rhs)
def union(intbitset self, intbitset rhs):
"""Return the union of two intbitsets as a new set.
(i.e. all elements that are in either intbitsets.)
"""
return self.__or__(rhs)
def union_update(intbitset self, intbitset rhs):
"""Update a intbitset with the union of itself and another."""
self.__ior__(rhs)
def issubset(intbitset self, intbitset rhs):
"""Report whether another set contains this set."""
return self.__le__(rhs)
def issuperset(intbitset self, intbitset rhs):
"""Report whether this set contains another set."""
return self.__ge__(rhs)
def symmetric_difference(intbitset self, intbitset rhs):
"""Return the symmetric difference of two sets as a new set.
(i.e. all elements that are in exactly one of the sets.)
"""
return self.__xor__(rhs)
def symmetric_difference_update(intbitset self, intbitset rhs):
"""Update an intbitset with the symmetric difference of itself and another.
"""
self.__ixor__(rhs)
def fastdump(intbitset self):
"""Return a compressed string representation suitable to be saved
somewhere."""
return zlib.compress(PyString_FromStringAndSize(<char *>self.bitset.bitset, self.bitset.size * wordbytesize))
def copy(intbitset self):
"""Return a shallow copy of a set."""
return intbitset(self)
def pop(intbitset self):
"""Remove and return an arbitrary set element."""
cdef int ret
ret = intBitSetGetNext(self.bitset, -1)
if ret < 0:
raise KeyError, "pop from an empty intbitset"
intBitSetDelElem(self.bitset, ret)
return ret
def remove(intbitset self, size_t elem):
"""Remove an element from a set; it must be a member.
If the element is not a member, raise a KeyError.
"""
if intBitSetIsInElem(self.bitset, elem):
intBitSetDelElem(self.bitset, elem)
else:
raise KeyError, elem
def fastload(intbitset self, strdump):
"""Return a compressed string representation suitable to be saved
somewhere."""
cdef size_t size
cdef void *buf
if type(strdump) is str:
try:
tmp = zlib.decompress(strdump)
PyObject_AsReadBuffer(tmp, &buf, &size)
intBitSetResetFromBuffer((<intbitset> self).bitset, buf, size)
except:
raise ValueError, "rhs is corrupted"
elif type(strdump) is array:
try:
tmp = zlib.decompress(strdump.tostring())
PyObject_AsReadBuffer(tmp, &buf, &size)
intBitSetResetFromBuffer((<intbitset> self).bitset, buf, size)
except:
raise ValueError, "rhs is corrupted"
return self
def strbits(intbitset self):
cdef size_t i
cdef size_t last
last = 0
ret = ''
for i in self:
ret = ret + '0'*(i-last)+'1'
last = i+1
return ret
def update_with_signs(intbitset self, rhs):
"""Given a dictionary rhs whose keys are integers, remove all the integers
whose value are less than 0 and add every integer whose value is 0 or more"""
cdef size_t value
try:
for value, sign in rhs.items():
if sign < 0:
intBitSetDelElem(self.bitset, value)
else:
intBitSetAddElem(self.bitset, value)
except AttributeError:
raise TypeError, "rhs should be a valid dictionary with integers keys and integer values"
def get_sorted_element(intbitset self, int index):
"""Return element at position index in the sorted representation of the
set. Note that index must be less than len(self)"""
cdef size_t l
cdef int last
cdef size_t i
l = intBitSetGetTot(self.bitset)
if index < 0:
index = index + l
if 0 <= index < l:
last = intBitSetGetNext(self.bitset, -1)
for i from 0 <= i < index:
last = intBitSetGetNext(self.bitset, last)
else:
raise IndexError, "intbitset index out of range"
return last
def to_sorted_list(intbitset self, int i, int j):
"""Return a sublist of the sorted representation of the set.
Note, negative indices are not supported."""
cdef size_t l
cdef int last
cdef size_t cnt
l = intBitSetGetTot(self.bitset)
if i == 0 and j == -1:
return intbitset(self)
ret = intbitset()
if i < 0:
i = i + l
if j < 0:
j = j + l
if i >= l:
i = l
if j >= l:
j = l
last = -1
for cnt from 0 <= cnt < i:
last = intBitSetGetNext(self.bitset, last)
for cnt from i <= cnt < j:
last = intBitSetGetNext(self.bitset, last)
intBitSetAddElem((<intbitset> ret).bitset, last)
return ret
def intbitsetfull(size_t size):
"""Build a intbiset containing 0..size-1 elements."""
ret = intbitset()
intBitSetDestroy((<intbitset> ret).bitset)
(<intbitset> ret).bitset = intBitSetCreateFull(size)
return ret
cdef class intbitset_iterator:
cdef int last
cdef IntBitSet *bitset
def __new__(intbitset_iterator self, intbitset bitset):
self.last = -1
self.bitset = bitset.bitset
def __next__(intbitset_iterator self):
self.last = intBitSetGetNext((<intbitset_iterator>self).bitset, self.last)
if self.last < 0:
self.last = -2
raise StopIteration
return self.last
def __iter__(intbitset_iterator self):
return self

Event Timeline