Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F63737818
intbitset.pyx
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, May 22, 04:29
Size
20 KB
Mime Type
text/x-python
Expires
Fri, May 24, 04:29 (2 d)
Engine
blob
Format
Raw Data
Handle
17812654
Attached To
R3600 invenio-infoscience
intbitset.pyx
View Options
# $Id$
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
Defines an intbitset data object to hold unordered sets of unsigned
integers with ultra fast set operations, implemented via bit vectors
and Python C extension to optimize speed and memory usage.
Emulates the Python built-in set class interface with some additional
specific methods such as its own fast dump and load marshalling
functions. Uses real bits to optimize memory usage, so may have
issues with endianness if you transport serialized bitsets between
various machine architectures.
"""
import
zlib
from
array
import
array
# Element type of the IntBitSet.bitset array (also used for the struct's
# trailing_bits field, see the "intbitset.h" declarations).
ctypedef unsigned long long int word_t

# Small integer used as a boolean by the C helpers.
ctypedef unsigned char bool_t

# Length type handed to PyString_FromStringAndSize / PyObject_AsReadBuffer.
# NOTE(review): declared here as an unsigned 64-bit integer, whereas
# CPython's own Py_ssize_t is signed -- presumably kept for interpreters
# that predate Py_ssize_t; confirm before changing.
ctypedef unsigned long long int Py_ssize_t
# CPython C-API entry points used for (de)serialising the raw bit vector.
cdef extern from "Python.h":
    # Build a Python string from a raw memory area of ``len`` bytes.
    object PyString_FromStringAndSize(char *s, Py_ssize_t len)
    # Expose the read buffer of ``obj``; callers in this file treat a
    # negative return value as failure.
    int PyObject_AsReadBuffer(object obj, void **buf, Py_ssize_t *buf_len)
# Declarations of the C implementation that backs the intbitset class.
cdef extern from "intbitset.h":
    ctypedef struct IntBitSet:
        int size
        int allocated
        word_t trailing_bits
        int tot
        word_t *bitset

    # Word geometry exported by the C layer (bytes / bits per word_t).
    int wordbytesize
    int wordbitsize

    # --- creation, reset, destruction, cloning ---
    IntBitSet *intBitSetCreate(int size, bool_t trailing_bits)
    IntBitSet *intBitSetCreateFromBuffer(void *buf, int bufsize)
    IntBitSet *intBitSetResetFromBuffer(IntBitSet *bitset, void *buf, int bufsize)
    IntBitSet *intBitSetReset(IntBitSet *bitset)
    void intBitSetDestroy(IntBitSet *bitset)
    IntBitSet *intBitSetClone(IntBitSet *bitset)

    # --- accessors ---
    int intBitSetGetSize(IntBitSet *bitset)
    int intBitSetGetAllocated(IntBitSet *bitset)
    int intBitSetGetTot(IntBitSet *bitset)

    # --- single element operations ---
    bool_t intBitSetIsInElem(IntBitSet *bitset, int elem)
    void intBitSetAddElem(IntBitSet *bitset, int elem)
    void intBitSetDelElem(IntBitSet *bitset, int elem)
    bool_t intBitSetEmpty(IntBitSet *bitset)

    # --- set algebra returning a new IntBitSet ---
    IntBitSet *intBitSetUnion(IntBitSet *x, IntBitSet *y)
    IntBitSet *intBitSetIntersection(IntBitSet *x, IntBitSet *y)
    IntBitSet *intBitSetSub(IntBitSet *x, IntBitSet *y)
    IntBitSet *intBitSetXor(IntBitSet *x, IntBitSet *y)

    # --- in-place set algebra (first argument is updated) ---
    IntBitSet *intBitSetIUnion(IntBitSet *dst, IntBitSet *src)
    IntBitSet *intBitSetIIntersection(IntBitSet *dst, IntBitSet *src)
    IntBitSet *intBitSetISub(IntBitSet *x, IntBitSet *y)
    IntBitSet *intBitSetIXor(IntBitSet *x, IntBitSet *y)

    # --- traversal and comparison ---
    # Next element after ``last``; callers in this file start from -1 and
    # treat a negative result as "no more elements".
    int intBitSetGetNext(IntBitSet *x, int last)
    # Result is decoded by intbitset.__richcmp__: 0 means equal, 1 means
    # x < y, 2 means x > y; any other value means none of those hold.
    unsigned char intBitSetCmp(IntBitSet *x, IntBitSet *y)
cdef class intbitset:
    """
    Defines an intbitset data object to hold unordered sets of
    unsigned integers with ultra fast set operations, implemented via
    bit vectors and Python C extension to optimize speed and memory
    usage.

    Emulates the Python built-in set class interface with some
    additional specific methods such as its own fast dump and load
    marshalling functions.  Uses real bits to optimize memory usage,
    so may have issues with endianness if you transport serialized
    bitsets between various machine architectures.
    """
    # Underlying C bit vector; owned by this object and released in
    # __dealloc__.
    cdef IntBitSet *bitset

    def __new__(self, rhs=0, int minsize=-1, object trailing_bits=0):
        """
        Initialize intbitset.

        * rhs can be:
          - int/long for allocating an empty intbitset that will hold at
            least rhs elements, before being resized
          - intbitset for cloning
          - str for retrieving an intbitset that was dumped into a string
          - array for retrieving an intbitset that was dumped into a string
            stored in an array
          - sequence (or any iterable) made of integers for copying all the
            elements from it.  If minsize is specified then it is initially
            allocated enough space to hold up to minsize integers, otherwise
            the biggest element of the sequence will be used.
        * minsize is a suggested initial upper bound on the numbers that
          will be stored, used instead of looking at rhs.
        * if trailing_bits is 1, then the set will contain "all" the
          positive integers after the biggest one added with rhs.

        Raises ValueError for negative or corrupted input and TypeError
        when rhs is of an unsupported type.
        """
        cdef Py_ssize_t size
        cdef void *buf
        cdef int elem
        cdef int last
        cdef int remelem
        msg = "Error"
        self.bitset = NULL
        # Initialised up front, exactly as fastload() does, so that nothing
        # undefined is read if PyObject_AsReadBuffer fills them only partly.
        buf = NULL
        size = 0
        if type(rhs) in (int, long):
            if rhs < 0:
                raise ValueError("rhs can't be negative")
            self.bitset = intBitSetCreate(rhs, trailing_bits)
        elif type(rhs) is intbitset:
            self.bitset = intBitSetClone((<intbitset>rhs).bitset)
        elif type(rhs) in (str, array):
            try:
                if type(rhs) is array:
                    rhs = rhs.tostring()
                # tmp must stay referenced while buf points into it
                tmp = zlib.decompress(rhs)
                if PyObject_AsReadBuffer(tmp, &buf, &size) < 0:
                    raise Exception("Buffer error!!!")
                self.bitset = intBitSetCreateFromBuffer(buf, size)
            except Exception, msg:
                raise ValueError("rhs is corrupted: %s" % msg)
        elif hasattr(rhs, '__iter__'):
            try:
                if minsize > -1:
                    self.bitset = intBitSetCreate(minsize, trailing_bits)
                else:
                    if iter(rhs) is rhs:
                        # One-shot iterator (e.g. a generator): max() below
                        # would exhaust it and the fill loop would then see
                        # nothing, so materialise it first.
                        rhs = list(rhs)
                    if rhs:
                        self.bitset = intBitSetCreate(int(max(rhs)),
                                                      trailing_bits)
                    else:
                        self.bitset = intBitSetCreate(0, trailing_bits)
                if trailing_bits:
                    # Every integer between two consecutive elements is
                    # removed; this is only meaningful when rhs is iterated
                    # in increasing order.
                    last = 0
                    for elem in rhs:
                        if elem < 0:
                            raise ValueError("Negative numbers, not allowed")
                        for remelem from last <= remelem < elem:
                            intBitSetDelElem(self.bitset, remelem)
                        last = elem + 1
                else:
                    for elem in rhs:
                        if elem < 0:
                            raise ValueError("Negative numbers, not allowed")
                        intBitSetAddElem(self.bitset, elem)
            except Exception, msg:
                raise ValueError(
                    "retrieving integers from rhs is impossible: %s" % msg)
        else:
            raise TypeError("rhs is of unknown type %s" % type(rhs))

    def __dealloc__(self):
        """Release the underlying C bit vector, if one was created."""
        if self.bitset:
            intBitSetDestroy(self.bitset)

    def __contains__(self, int elem):
        """Return True if elem is in the set; negative elem is an error."""
        if elem < 0:
            raise ValueError("Negative numbers, not allowed")
        return intBitSetIsInElem(self.bitset, elem) != 0

    def __cmp__(self, intbitset rhs not None):
        """Three-way comparison is deliberately unsupported."""
        raise TypeError("cannot compare intbitset using cmp()")

    def __richcmp__(self, intbitset rhs not None, int op):
        """Set comparison; op is the CPython rich comparison opcode (0..5).

        The answer is derived from the code returned by intBitSetCmp:
        0 is treated as "equal", 1 as "self < rhs" and 2 as "self > rhs".
        """
        cdef short unsigned int tmp
        tmp = intBitSetCmp((<intbitset>self).bitset, rhs.bitset)
        if op == 0:  # <
            return tmp == 1
        if op == 1:  # <=
            return tmp <= 1
        if op == 2:  # ==
            return tmp == 0
        if op == 3:  # !=
            return tmp > 0
        if op == 4:  # >
            return tmp == 2
        if op == 5:  # >=
            return tmp in (0, 2)

    def __len__(self):
        """Return the element count reported by intBitSetGetTot."""
        return intBitSetGetTot(self.bitset)

    def __hash__(self):
        """Hash a leading slice of the raw bit vector.

        NOTE(review): the slice length is derived from the number of
        elements, not from the vector size, so sets differing only in
        high elements presumably collide -- confirm this is acceptable.
        """
        return hash(PyString_FromStringAndSize(
            <char *>self.bitset.bitset,
            wordbytesize * (intBitSetGetTot(self.bitset) / wordbitsize + 1)))

    def __nonzero__(self):
        """Return True unless the set is empty."""
        return not intBitSetEmpty(self.bitset)

    def __iadd__(self, rhs):
        """Add an integer, an intbitset or an iterable of integers in place."""
        cdef int elem
        if isinstance(rhs, (int, long)):
            if rhs < 0:
                raise ValueError("Negative numbers, not allowed")
            intBitSetAddElem(self.bitset, rhs)
        elif isinstance(rhs, intbitset):
            intBitSetIUnion(self.bitset, (<intbitset>rhs).bitset)
        else:
            for elem in rhs:
                if elem < 0:
                    raise ValueError("Negative numbers, not allowed")
                intBitSetAddElem(self.bitset, elem)
        return self

    def __isub__(self, rhs):
        """Remove an integer, an intbitset or an iterable of integers in place."""
        cdef int elem
        if isinstance(rhs, (int, long)):
            if rhs < 0:
                raise ValueError("Negative numbers, not allowed")
            intBitSetDelElem(self.bitset, rhs)
        elif isinstance(rhs, intbitset):
            intBitSetISub(self.bitset, (<intbitset>rhs).bitset)
        else:
            for elem in rhs:
                if elem < 0:
                    raise ValueError("Negative numbers, not allowed")
                intBitSetDelElem(self.bitset, elem)
        return self

    def __deepcopy__(self, memo):
        """Return an independent copy (memo is not used)."""
        return intbitset(self)

    # NOTE(review): extension types release resources in __dealloc__, and a
    # method named __del__ taking an element looks like it was meant to be
    # __delitem__ or a plain removal helper.  Kept under its original name
    # so that existing callers are not broken -- confirm intent.
    def __del__(self, int elem):
        """Remove elem from the set; negative elem is an error."""
        if elem < 0:
            raise ValueError("Negative numbers, not allowed")
        intBitSetDelElem(self.bitset, elem)

    def __and__(self, intbitset rhs not None):
        """Return the intersection of two intbitsets as a new intbitset."""
        cdef intbitset ret
        if not isinstance(self, intbitset):
            # Binary operator methods of extension types can be invoked with
            # a foreign left operand; casting it would read bogus memory.
            return NotImplemented
        ret = intbitset()
        intBitSetDestroy(ret.bitset)
        ret.bitset = intBitSetIntersection((<intbitset>self).bitset,
                                           rhs.bitset)
        return ret

    def __or__(self, intbitset rhs not None):
        """Return the union of two intbitsets as a new intbitset."""
        cdef intbitset ret
        if not isinstance(self, intbitset):
            # See __and__: the left operand is not guaranteed to be ours.
            return NotImplemented
        ret = intbitset()
        intBitSetDestroy(ret.bitset)
        ret.bitset = intBitSetUnion((<intbitset>self).bitset, rhs.bitset)
        return ret

    def __xor__(self, intbitset rhs not None):
        """Return the symmetric difference of two intbitsets as a new one."""
        cdef intbitset ret
        if not isinstance(self, intbitset):
            # See __and__: the left operand is not guaranteed to be ours.
            return NotImplemented
        ret = intbitset()
        intBitSetDestroy(ret.bitset)
        ret.bitset = intBitSetXor((<intbitset>self).bitset, rhs.bitset)
        return ret

    def __sub__(self, intbitset rhs not None):
        """Return the difference of two intbitsets as a new intbitset."""
        cdef intbitset ret
        if not isinstance(self, intbitset):
            # See __and__: the left operand is not guaranteed to be ours.
            return NotImplemented
        ret = intbitset()
        intBitSetDestroy(ret.bitset)
        ret.bitset = intBitSetSub((<intbitset>self).bitset, rhs.bitset)
        return ret

    def __iand__(self, intbitset rhs not None):
        """Intersect this set with rhs in place."""
        intBitSetIIntersection(self.bitset, rhs.bitset)
        return self

    def __ior__(self, intbitset rhs not None):
        """Merge rhs into this set in place."""
        intBitSetIUnion(self.bitset, rhs.bitset)
        return self

    def __ixor__(self, intbitset rhs not None):
        """Replace this set with its symmetric difference with rhs."""
        intBitSetIXor(self.bitset, rhs.bitset)
        return self

    def __repr__(self):
        """Return a constructor-like representation listing the elements.

        For a set with trailing bits the listing stops at the first element
        that is not below size * wordbitsize.
        """
        cdef int last
        cdef int maxelem
        parts = []
        last = -1
        if self.bitset.trailing_bits:
            maxelem = intBitSetGetSize(self.bitset) * wordbitsize
            while last < maxelem:
                last = intBitSetGetNext(self.bitset, last)
                parts.append('%i' % last)
            return "intbitset([" + ", ".join(parts) + '], trailing_bits=True)'
        while 1:
            last = intBitSetGetNext(self.bitset, last)
            if last < 0:
                # negative value: no more elements
                break
            parts.append('%i' % last)
        return "intbitset([" + ", ".join(parts) + '])'

    def __str__(self):
        """Return an abbreviated representation.

        Sets with more than 10 elements show their first and last five;
        sets whose element count is reported as negative (the infinite
        case in this file) show their first ten followed by "...".
        """
        cdef int tot
        cdef int last
        cdef int cnt
        tot = intBitSetGetTot(self.bitset)
        parts = []
        if tot < 0:
            # to_sorted_list() refuses sets with trailing bits, so walk the
            # bit vector directly.
            last = -1
            for cnt from 0 <= cnt < 10:
                last = intBitSetGetNext(self.bitset, last)
                if last < 0:
                    break
                parts.append('%i' % last)
            parts.append("...")
            return "intbitset([" + ", ".join(parts) + "])"
        elif tot > 10:
            for n in self.to_sorted_list(0, 5):
                parts.append('%i' % n)
            parts.append("...")
            for n in self.to_sorted_list(tot - 5, tot):
                parts.append('%i' % n)
            return "intbitset([" + ", ".join(parts) + '])'
        else:
            return self.__repr__()

    ## Buffer interface
    #def __getreadbuffer__(self, int i, void **p):
        #if i != 0:
            #return -1
        #p[0] = (<intbitset >self).bitset
        #return (<intbitset >self).size * wordbytesize

    #def __getwritebuffer__(self, int i, void **p):
        #if i != 0:
            #raise SystemError
        #p[0] = (<intbitset >self).bitset
        #return (<intbitset >self).size * wordbytesize

    #def __getsegcount__(self, int *p):
        #if p != NULL:
            #p[0] = (<intbitset >self).size * wordbytesize
        #return 1

    #def __getcharbuffer__(self, int i, char **p):
        #if i != 0:
            #return -1
        #p[0] = <char *> (<intbitset >self).bitset
        #return (<intbitset >self).size * wordbytesize

    # Iterator interface
    def __iter__(self):
        """Return an iterator over the elements; infinite sets are refused."""
        if self.bitset.trailing_bits:
            raise OverflowError(
                "It's impossible to iterate over an infinite set.")
        return intbitset_iterator(self)

    # Customized interface
    def add(self, int elem):
        """Add an element to a set.
        This has no effect if the element is already present."""
        if elem < 0:
            raise ValueError("Negative numbers, not allowed")
        intBitSetAddElem(self.bitset, elem)

    def clear(self):
        """Reset the set through intBitSetReset."""
        intBitSetReset(self.bitset)

    def difference(intbitset self, intbitset rhs):
        """Return the difference of two intbitsets as a new set.
        (i.e. all elements that are in this intbitset but not the other.)
        """
        return self.__sub__(rhs)

    def difference_update(self, intbitset rhs):
        """Remove all elements of another set from this set."""
        self.__isub__(rhs)

    def discard(self, int elem):
        """Remove an element from a intbitset if it is a member.
        If the element is not a member, do nothing.

        Negative elements are rejected with ValueError, like in add() and
        remove(), instead of being handed to the C layer.
        """
        if elem < 0:
            raise ValueError("Negative numbers, not allowed")
        intBitSetDelElem(self.bitset, elem)

    def intersection(self, intbitset rhs):
        """Return the intersection of two intbitsets as a new set.
        (i.e. all elements that are in both intbitsets.)
        """
        return self.__and__(rhs)

    def intersection_update(self, intbitset rhs):
        """Update a intbitset with the intersection of itself and another."""
        self.__iand__(rhs)

    def union(self, intbitset rhs):
        """Return the union of two intbitsets as a new set.
        (i.e. all elements that are in either intbitsets.)
        """
        return self.__or__(rhs)

    def union_update(self, intbitset rhs):
        """Update a intbitset with the union of itself and another."""
        self.__ior__(rhs)

    def issubset(self, intbitset rhs):
        """Report whether another set contains this set."""
        return self.__le__(rhs)

    def issuperset(self, intbitset rhs):
        """Report whether this set contains another set."""
        return self.__ge__(rhs)

    def symmetric_difference(self, intbitset rhs):
        """Return the symmetric difference of two sets as a new set.
        (i.e. all elements that are in exactly one of the sets.)
        """
        return self.__xor__(rhs)

    def symmetric_difference_update(self, intbitset rhs):
        """Update an intbitset with the symmetric difference of itself and
        another.
        """
        self.__ixor__(rhs)

    # Dumping & Loading
    def fastdump(self):
        """Return a compressed string representation suitable to be saved
        somewhere."""
        cdef Py_ssize_t size
        size = intBitSetGetSize((<intbitset>self).bitset)
        # size + 1 words are serialised
        raw = PyString_FromStringAndSize(<char *>self.bitset.bitset,
                                         (size + 1) * wordbytesize)
        return zlib.compress(raw)

    def fastload(self, object strdump):
        """Load a compressed string representation produced by a previous call
        to the fastdump method into the current intbitset. The previous content
        will be replaced.

        Returns self; raises ValueError when strdump cannot be decoded.
        """
        cdef Py_ssize_t size
        cdef void *buf
        buf = NULL
        size = 0
        try:
            if type(strdump) is array:
                strdump = strdump.tostring()
            # tmp needed to not be garbage collected
            tmp = zlib.decompress(strdump)
            if PyObject_AsReadBuffer(tmp, &buf, &size) < 0:
                raise Exception
            intBitSetResetFromBuffer((<intbitset>self).bitset, buf, size)
        except Exception:
            # A bare "except:" would also translate interpreter-level
            # exceptions into a misleading "corrupted" error.
            raise ValueError("strdump is corrupted")
        return self

    def copy(self):
        """Return a shallow copy of a set."""
        return intbitset(self)

    def pop(self):
        """Remove and return the first element found from the start of the
        set; raise KeyError if there is none."""
        cdef int ret
        ret = intBitSetGetNext(self.bitset, -1)
        if ret < 0:
            raise KeyError("pop from an empty intbitset")
        intBitSetDelElem(self.bitset, ret)
        return ret

    def remove(self, int elem):
        """Remove an element from a set; it must be a member.
        If the element is not a member, raise a KeyError.
        """
        if elem < 0:
            raise ValueError("Negative numbers, not allowed")
        if intBitSetIsInElem(self.bitset, elem):
            intBitSetDelElem(self.bitset, elem)
        else:
            raise KeyError(elem)

    def strbits(self):
        """Return a string of 0s and 1s representing the content in memory
        of the intbitset.
        """
        cdef int i
        cdef int last
        if (<intbitset>self).bitset.trailing_bits:
            raise OverflowError("It's impossible to print an infinite set.")
        last = 0
        chunks = []
        for i in self:
            # zeros for the gap since the previous element, then the element
            chunks.append('0' * (i - last) + '1')
            last = i + 1
        return ''.join(chunks)

    def update_with_signs(self, rhs):
        """Given a dictionary rhs whose keys are integers, remove all the
        integers whose value are less than 0 and add every integer whose value
        is 0 or more"""
        cdef int value
        try:
            for value, sign in rhs.items():
                if value < 0:
                    raise ValueError("Negative numbers, not allowed")
                if sign < 0:
                    intBitSetDelElem(self.bitset, value)
                else:
                    intBitSetAddElem(self.bitset, value)
        except AttributeError:
            raise TypeError("rhs should be a valid dictionary with integers "
                            "keys and integer values")

    def get_size(self):
        """Return the value of intBitSetGetSize for this set."""
        return intBitSetGetSize(self.bitset)

    def get_allocated(self):
        """Return the value of intBitSetGetAllocated for this set."""
        return intBitSetGetAllocated(self.bitset)

    def get_sorted_element(self, int index):
        """Return element at position index in the sorted representation of the
        set. Note that index must be less than len(self).

        A negative index counts from the end and is refused with
        OverflowError on a set with trailing bits; an out-of-range index
        raises IndexError.
        """
        cdef int l
        cdef int last
        cdef int i
        l = intBitSetGetTot(self.bitset)
        if index < 0:
            if self.bitset.trailing_bits:
                raise OverflowError("It's impossible to retrieve a negative "
                                    "item from an infinite set.")
            index = index + l
        if not (0 <= index < l):
            raise IndexError("intbitset index out of range")
        last = intBitSetGetNext(self.bitset, -1)
        for i from 0 <= i < index:
            last = intBitSetGetNext(self.bitset, last)
        return last

    def to_sorted_list(self, int i, int j):
        """Return, as a new intbitset, the elements found at positions
        i <= position < j of the sorted representation of the set.

        Negative positions count from the end and positions are clamped
        to the valid range.  As a special case, (0, -1) returns a copy of
        the whole set.  Sets with trailing bits raise OverflowError.
        """
        cdef int l
        cdef int last
        cdef int cnt
        cdef intbitset ret
        if self.bitset.trailing_bits:
            raise OverflowError(
                "It's impossible to retrieve a list from an infinite set.")
        l = intBitSetGetTot(self.bitset)
        if i == 0 and j == -1:
            return intbitset(self)
        ret = intbitset()
        if i < 0:
            i = i + l
        if j < 0:
            j = j + l
        # Clamp on both sides: a position that is still negative here would
        # make the copy loop run past the last element and feed the
        # end-of-set sentinel to intBitSetAddElem.
        if i < 0:
            i = 0
        if j < 0:
            j = 0
        if i >= l:
            i = l
        if j >= l:
            j = l
        last = -1
        # skip the first i elements...
        for cnt from 0 <= cnt < i:
            last = intBitSetGetNext(self.bitset, last)
        # ...then copy elements i .. j-1
        for cnt from i <= cnt < j:
            last = intBitSetGetNext(self.bitset, last)
            intBitSetAddElem(ret.bitset, last)
        return ret
cdef class intbitset_iterator:
    """Iterator over the elements of an intbitset.

    Elements are produced by repeatedly calling intBitSetGetNext on the
    bit vector of the intbitset being iterated.
    """
    # Last element returned; -1 before the first call, -2 once exhausted.
    cdef int last
    # Borrowed pointer to the bit vector of ``owner``.
    cdef IntBitSet *bitset
    # Strong reference to the iterated intbitset: without it, iterating a
    # temporary (e.g. ``iter(intbitset([1, 2]))``) would leave ``bitset``
    # pointing at memory already released by intbitset.__dealloc__.
    cdef intbitset owner

    def __new__(self, intbitset bitset not None):
        """Start iterating over ``bitset`` from its first element."""
        self.last = -1
        self.owner = bitset
        self.bitset = bitset.bitset

    def __next__(self):
        """Return the next element, or raise StopIteration at the end."""
        if self.last == -2:
            # Already exhausted: keep raising instead of calling the C
            # layer again with the sentinel value.
            raise StopIteration
        self.last = intBitSetGetNext((<intbitset_iterator>self).bitset,
                                     self.last)
        if self.last < 0:
            self.last = -2
            raise StopIteration
        return self.last

    def __iter__(self):
        """Return the iterator itself."""
        return self
Event Timeline
Log In to Comment