Index: src/lucille/_storemodule.c
===================================================================
--- src/lucille/_storemodule.c	(.../trunk)	(revision 0)
+++ src/lucille/_storemodule.c	(.../branches/native_IndexInput)	(revision 115)
@@ -0,0 +1,832 @@
+
+/* Copyright 2007 Dan Callaghan */
+
+/*
+ * This file is part of Lucille, based on Apache Lucene
+ * <http://lucene.apache.org/>.
+ *
+ * Lucille is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License as published by the Free Software
+ * Foundation; either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * Lucille is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <Python.h>
+#include "structmember.h"
+
+
+/* The following struct definition is copied verbatim from
+ * Modules/mmapmodule.c in the Python source. This makes the code here fragile
+ * (if Python updates the mmap module, we have to update this) and means that
+ * compiled versions of this extension may not be binary compatible across
+ * Python releases or even compiler versions.
+ *
+ * However it does let us get access to the mmap object's guts without any
+ * overhead of going back through the Python interpreter.
+ */
+
+#ifdef MS_WINDOWS
+#include <windows.h> /* for HANDLE */
+#else
+#define UNIX
+#endif
+
+typedef struct {
+    PyObject_HEAD
+    char * data;
+    size_t size;
+    size_t pos;
+
+#ifdef MS_WINDOWS
+    HANDLE map_handle;
+    HANDLE file_handle;
+    char * tagname;
+#endif
+
+#ifdef UNIX
+    int fd;
+#endif
+
+    int access;
+} mmap_object;
+
+#ifdef UNIX
+#include <sys/mman.h>
+#endif
+
+/* End code copied from Modules/mmapmodule.c */
+
+
+static PyObject *threading_module = NULL;
+static PyObject *mmap_module = NULL;
+static PyObject *zero = NULL;
+static PyObject *four = NULL;
+static PyObject *eight = NULL;
+static PyObject *str_read_bytes = NULL;
+static PyObject *str_read_byte = NULL;
+
+
+/* Decodes a big-endian, two's-complement 32-bit integer from four bytes. */
+static long decode_int32(const unsigned char *b) {
+    unsigned long u = (unsigned long) b[0] << 24 | (unsigned long) b[1] << 16 |
+                      (unsigned long) b[2] << 8 | (unsigned long) b[3];
+
+    /* shifting a promoted int into its sign bit is undefined, so assemble the
+     * value unsigned and apply the sign arithmetically */
+    if (u & 0x80000000UL)
+        return (long) (u & 0x7FFFFFFFUL) - 0x7FFFFFFFL - 1L;
+    return (long) u;
+}
+
+/* Decodes a big-endian, two's-complement 64-bit integer from eight bytes. */
+static PY_LONG_LONG decode_int64(const unsigned char *b) {
+    unsigned PY_LONG_LONG u = 0;
+    int i;
+
+    for (i = 0; i < 8; i ++)
+        u = u << 8 | b[i];
+    if (u >> 63)
+        return -(PY_LONG_LONG) ~u - 1;
+    return (PY_LONG_LONG) u;
+}
+
+/* Source of bytes for the shared decoders below: returns the next byte
+ * (0..255), or -1 with a Python exception set. */
+typedef int (*read_byte_func)(PyObject *self);
+
+/* read_byte_func which calls the (possibly overridden) read_byte() method. */
+static int call_read_byte(PyObject *self) {
+    PyObject *in_obj;
+    long in;
+
+    if (!(in_obj = PyObject_CallMethodObjArgs(self, str_read_byte, NULL)))
+        return -1;
+    in = PyInt_AsLong(in_obj);
+    Py_DECREF(in_obj);
+    if (in == -1 && PyErr_Occurred())
+        return -1;
+    if (in < 0 || in > 255) {
+        PyErr_Format(PyExc_ValueError,
+                "read_byte() returned %ld, expected a value in 0..255", in);
+        return -1;
+    }
+    return (int) in;
+}
+
+/* Reads a Lucene variable-length integer: seven data bits per byte, least
+ * significant group first, high bit set on every byte but the last.
+ * Returns 0 and stores the value in *val, or returns -1 with a Python
+ * exception set. */
+static int decode_vint(read_byte_func read_byte, PyObject *self,
+        unsigned PY_LONG_LONG *val) {
+    unsigned PY_LONG_LONG result = 0;
+    unsigned int shift = 0;
+    int in;
+
+    do {
+        if ((in = read_byte(self)) < 0)
+            return -1;
+        if (shift >= 8 * sizeof(result)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "variable-length integer does not fit in 64 bits");
+            return -1;
+        }
+        /* widen before shifting: an int shifted by 32 or more is undefined */
+        result |= (unsigned PY_LONG_LONG) (in & 127) << shift;
+        shift += 7;
+    } while (in & 128);
+    *val = result;
+    return 0;
+}
+
+/* Decodes n characters of Java "modified UTF-8" (one to three bytes each).
+ * The characters are stored in buff, or discarded if buff is NULL.
+ * Returns 0 on success, -1 with a Python exception set on failure. */
+static int decode_chars(read_byte_func read_byte, PyObject *self,
+        Py_UNICODE *buff, Py_ssize_t n) {
+    Py_ssize_t i;
+
+    for (i = 0; i < n; i ++) {
+        int b, b_mid, b_lo;
+        Py_UNICODE ch;
+
+        if ((b = read_byte(self)) < 0)
+            return -1;
+        if ((b & 0x80) == 0) {
+            ch = (Py_UNICODE) b;
+        } else if ((b & 0xE0) != 0xE0) {
+            if ((b_lo = read_byte(self)) < 0)
+                return -1;
+            ch = (Py_UNICODE) ((b & 0x1F) << 6 | (b_lo & 0x3F));
+        } else {
+            if ((b_mid = read_byte(self)) < 0 || (b_lo = read_byte(self)) < 0)
+                return -1;
+            ch = (Py_UNICODE) ((b & 0x0F) << 12 | (b_mid & 0x3F) << 6 |
+                    (b_lo & 0x3F));
+        }
+        if (buff)
+            buff[i] = ch;
+    }
+    return 0;
+}
+
+/* Reads n (>= 0) characters into a new unicode object.
+ * Returns a new reference, or NULL with a Python exception set. */
+static PyObject *read_unicode(read_byte_func read_byte, PyObject *self,
+        Py_ssize_t n) {
+    PyObject *retval;
+
+    if (!(retval = PyUnicode_FromUnicode(NULL, n)))
+        return NULL;
+    /* nothing to fill in for an empty result, so leave its buffer untouched */
+    if (n > 0 && decode_chars(read_byte, self, PyUnicode_AS_UNICODE(retval), n) < 0) {
+        Py_DECREF(retval);
+        return NULL;
+    }
+    return retval;
+}
+
+/* Reads a vint character count followed by that many characters.
+ * Returns a new reference, or NULL with a Python exception set. */
+static PyObject *read_lucene_string(read_byte_func read_byte, PyObject *self) {
+    unsigned PY_LONG_LONG n;
+
+    if (decode_vint(read_byte, self, &n) < 0)
+        return NULL;
+    if (n > (unsigned PY_LONG_LONG) PY_SSIZE_T_MAX) {
+        PyErr_SetString(PyExc_OverflowError, "string length too large");
+        return NULL;
+    }
+    return read_unicode(read_byte, self, (Py_ssize_t) n);
+}
+
+/* Parses the single count argument of read_chars, skip_chars and read_bytes.
+ * Returns 0 on success, -1 with a Python exception set on failure. */
+static int parse_count(PyObject *args, Py_ssize_t *n) {
+    if (!PyArg_ParseTuple(args, "n", n))
+        return -1;
+    if (*n < 0) {
+        PyErr_SetString(PyExc_ValueError, "count must not be negative");
+        return -1;
+    }
+    return 0;
+}
+
+/* Replaces *slot with a new threading.RLock, releasing any previous value so
+ * that running __init__ twice does not leak.
+ * Returns 0 on success, -1 with a Python exception set on failure. */
+static int replace_filepos_lock(PyObject **slot) {
+    PyObject *lock, *old;
+
+    if (!(lock = PyObject_CallMethod(threading_module, "RLock", NULL)))
+        return -1;
+    old = *slot;
+    *slot = lock;
+    Py_XDECREF(old);
+    return 0;
+}
+
+
+#define BaseIndexInput_doc "filepos_lock should be acquired by callers around blocks which expect the \n" \
+    "file position to remain consistent. "
+
+typedef struct {
+    PyObject_HEAD
+    PyObject *filepos_lock;
+} BaseIndexInput;
+
+static PyMemberDef BaseIndexInput_members[] = {
+    {"filepos_lock", T_OBJECT_EX, offsetof(BaseIndexInput, filepos_lock), 0,
+        "for callers, to hold around seek operations"},
+    {NULL} /* sentinel */
+};
+
+static PyObject *BaseIndexInput_new(PyTypeObject *type, PyObject *args,
+        PyObject *kwargs) {
+    BaseIndexInput *self;
+
+    if (!(self = (BaseIndexInput *)type->tp_alloc(type, 0)))
+        return NULL;
+    self->filepos_lock = NULL;
+    return (PyObject *)self;
+}
+
+static int BaseIndexInput_init(BaseIndexInput *self, PyObject *args, PyObject *kwargs) {
+    /* accept any args but ignore them, so that subclasses are free to choose
+     * their own constructor signature */
+    return replace_filepos_lock(&self->filepos_lock);
+}
+
+static void BaseIndexInput_dealloc(BaseIndexInput *self) {
+    Py_XDECREF(self->filepos_lock);
+    self->ob_type->tp_free((PyObject *)self);
+}
+
+static PyObject *BaseIndexInput_read_chars(BaseIndexInput *self, PyObject *args) {
+    Py_ssize_t n;
+
+    if (parse_count(args, &n) < 0)
+        return NULL;
+    return read_unicode(call_read_byte, (PyObject *)self, n);
+}
+
+static PyObject *BaseIndexInput_skip_chars(BaseIndexInput *self, PyObject *args) {
+    Py_ssize_t n;
+
+    if (parse_count(args, &n) < 0)
+        return NULL;
+    if (decode_chars(call_read_byte, (PyObject *)self, NULL, n) < 0)
+        return NULL;
+    Py_RETURN_NONE;
+}
+
+/* Calls self.read_bytes(count) and checks that exactly `expected` bytes came
+ * back. Returns a new reference to the string, with *data pointing at its
+ * contents, or NULL with a Python exception set. */
+static PyObject *call_read_bytes(PyObject *self, PyObject *count,
+        Py_ssize_t expected, unsigned char **data) {
+    PyObject *in_obj;
+    char *in;
+    Py_ssize_t in_len;
+
+    if (!(in_obj = PyObject_CallMethodObjArgs(self, str_read_bytes, count, NULL)))
+        return NULL;
+    /* fails with TypeError if read_bytes() returned something other than str */
+    if (PyString_AsStringAndSize(in_obj, &in, &in_len) < 0) {
+        Py_DECREF(in_obj);
+        return NULL;
+    }
+    if (in_len != expected) {
+        Py_DECREF(in_obj);
+        PyErr_Format(PyExc_ValueError,
+                "read_bytes(%zd) returned string of length %zd", expected, in_len);
+        return NULL;
+    }
+    *data = (unsigned char *) in;
+    return in_obj;
+}
+
+static PyObject *BaseIndexInput_read_int(BaseIndexInput *self, PyObject *_) {
+    /* METH_NOARGS */
+    PyObject *in_obj, *retval;
+    unsigned char *in;
+
+    if (!(in_obj = call_read_bytes((PyObject *)self, four, 4, &in)))
+        return NULL;
+    retval = PyInt_FromLong(decode_int32(in));
+    Py_DECREF(in_obj);
+    return retval;
+}
+
+static PyObject *BaseIndexInput_read_long(BaseIndexInput *self, PyObject *_) {
+    /* METH_NOARGS */
+    PyObject *in_obj, *retval;
+    unsigned char *in;
+
+    if (!(in_obj = call_read_bytes((PyObject *)self, eight, 8, &in)))
+        return NULL;
+    retval = PyLong_FromLongLong(decode_int64(in));
+    /* the bytes have been decoded, so drop the string read_bytes() returned */
+    Py_DECREF(in_obj);
+    return retval;
+}
+
+static PyObject *BaseIndexInput_read_vint(BaseIndexInput *self, PyObject *_) {
+    /* METH_NOARGS */
+    unsigned PY_LONG_LONG val;
+
+    if (decode_vint(call_read_byte, (PyObject *)self, &val) < 0)
+        return NULL;
+    return PyLong_FromUnsignedLongLong(val);
+}
+
+static PyObject *BaseIndexInput_read_string(BaseIndexInput *self, PyObject *_) {
+    /* METH_NOARGS */
+    return read_lucene_string(call_read_byte, (PyObject *)self);
+}
+
+static PyMethodDef BaseIndexInput_methods[] = {
+    {"read_chars", (PyCFunction) BaseIndexInput_read_chars, METH_VARARGS,
+        "Reads the given number of characters as a unicode.\n"
+        "\n"
+        "Lucene uses Java's \"modified UTF-8\" encoding, as specified for\n"
+        "java.io.DataInput."},
+    {"skip_chars", (PyCFunction) BaseIndexInput_skip_chars, METH_VARARGS,
+        "Like read_chars, but discards the characters read."},
+    {"read_int", (PyCFunction) BaseIndexInput_read_int, METH_NOARGS,
+        "Reads a signed 32-bit integer value."},
+    {"read_long", (PyCFunction) BaseIndexInput_read_long, METH_NOARGS,
+        "Reads a signed 64-bit integer value."},
+    {"read_vint", (PyCFunction) BaseIndexInput_read_vint, METH_NOARGS,
+        "Reads a Lucene variable-length integer value.\n"
+        "\n"
+        "Lucene also defines #readVLong(), but read_vint can be used for both."},
+    {"read_string", (PyCFunction) BaseIndexInput_read_string, METH_NOARGS,
+        "Reads a Lucene variable-length string (unicode) value."},
+    {NULL} /* sentinel */
+};
+
+static PyTypeObject BaseIndexInputType = {
+    PyObject_HEAD_INIT(NULL)
+    0, /* ob_size */
+    "lucille._store.BaseIndexInput",/* tp_name */
+    sizeof(BaseIndexInput), /* tp_basicsize */
+    0, /* tp_itemsize */
+    (destructor) BaseIndexInput_dealloc, /* tp_dealloc */
+    0, /* tp_print */
+    0, /* tp_getattr */
+    0, /* tp_setattr */
+    0, /* tp_compare */
+    0, /* tp_repr */
+    0, /* tp_as_number */
+    0, /* tp_as_sequence */
+    0, /* tp_as_mapping */
+    0, /* tp_hash */
+    0, /* tp_call */
+    0, /* tp_str */
+    0, /* tp_getattro */
+    0, /* tp_setattro */
+    0, /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+    BaseIndexInput_doc, /* tp_doc */
+    0, /* tp_traverse */
+    0, /* tp_clear */
+    0, /* tp_richcompare */
+    0, /* tp_weaklistoffset */
+    0, /* tp_iter */
+    0, /* tp_iternext */
+    BaseIndexInput_methods, /* tp_methods */
+    BaseIndexInput_members, /* tp_members */
+    0, /* tp_getset */
+    0, /* tp_base */
+    0, /* tp_dict */
+    0, /* tp_descr_get */
+    0, /* tp_descr_set */
+    0, /* tp_dictoffset */
+    (initproc) BaseIndexInput_init, /* tp_init */
+    0, /* tp_alloc */
+    BaseIndexInput_new, /* tp_new */
+};
+
+
+#define MMapIndexInput_doc "IndexInput implementation that uses mmap() for accessing files."
+
+typedef struct {
+    PyObject_HEAD
+    PyObject *filepos_lock;
+    mmap_object *mmap;
+    Py_ssize_t filepos;
+    Py_ssize_t offset;
+    Py_ssize_t length; /* the entire file is always mapped */
+} MMapIndexInput;
+
+static PyMemberDef MMapIndexInput_members[] = {
+    {"filepos_lock", T_OBJECT_EX, offsetof(MMapIndexInput, filepos_lock), RO,
+        "for callers, to hold around seek operations"},
+    {"mmap", T_OBJECT_EX, offsetof(MMapIndexInput, mmap), RO,
+        "underlying mmap object"},
+    {NULL} /* sentinel */
+};
+
+/* offset and length are Py_ssize_t fields. Python 2.5 has no T_PYSSIZET member
+ * type, and T_LONGLONG would access 8 bytes even where Py_ssize_t has only 4,
+ * so they are exposed through a getter and setter instead. The closure holds
+ * the offset of the field within MMapIndexInput. */
+static PyObject *MMapIndexInput_get_ssize(MMapIndexInput *self, void *closure) {
+    return PyInt_FromSsize_t(*(Py_ssize_t *) ((char *) self + (size_t) closure));
+}
+
+static int MMapIndexInput_set_ssize(MMapIndexInput *self, PyObject *value,
+        void *closure) {
+    Py_ssize_t n;
+
+    if (value == NULL) {
+        PyErr_SetString(PyExc_TypeError, "cannot delete attribute");
+        return -1;
+    }
+    n = PyInt_AsSsize_t(value);
+    if (n == -1 && PyErr_Occurred())
+        return -1;
+    if (n < 0) {
+        PyErr_SetString(PyExc_ValueError, "value must not be negative");
+        return -1;
+    }
+    *(Py_ssize_t *) ((char *) self + (size_t) closure) = n;
+    return 0;
+}
+
+static PyGetSetDef MMapIndexInput_getset[] = {
+    {"offset", (getter) MMapIndexInput_get_ssize, (setter) MMapIndexInput_set_ssize,
+        "file position offset (for compound files)",
+        (void *) offsetof(MMapIndexInput, offset)},
+    {"length", (getter) MMapIndexInput_get_ssize, (setter) MMapIndexInput_set_ssize,
+        "valid file length (may be less than real file length, for compound files)",
+        (void *) offsetof(MMapIndexInput, length)},
+    {NULL} /* sentinel */
+};
+
+static PyObject *MMapIndexInput_new(PyTypeObject *type, PyObject *args,
+        PyObject *kwargs) {
+    MMapIndexInput *self;
+
+    if (!(self = (MMapIndexInput *)type->tp_alloc(type, 0)))
+        return NULL;
+    self->filepos_lock = NULL;
+    self->mmap = NULL;
+    return (PyObject *)self;
+}
+
+static int MMapIndexInput_init(MMapIndexInput *self, PyObject *args, PyObject *kwargs) {
+    static char *kwlist[] = {"file", "length", NULL};
+    PyObject *file, *fd_obj, *map, *old;
+    Py_ssize_t length;
+    long fd;
+#ifdef MS_WINDOWS
+    PyObject *access_read;
+#endif
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "On", kwlist,
+            &file, &length))
+        return -1;
+    if (length < 0) {
+        PyErr_SetString(PyExc_ValueError, "length must not be negative");
+        return -1;
+    }
+
+    if (replace_filepos_lock(&self->filepos_lock) < 0)
+        return -1;
+
+    if (!(fd_obj = PyObject_CallMethod(file, "fileno", NULL)))
+        return -1;
+    fd = PyInt_AsLong(fd_obj);
+    Py_DECREF(fd_obj);
+    /* -1 only signals an error if PyInt_AsLong() actually raised */
+    if (fd == -1 && PyErr_Occurred())
+        return -1;
+
+    /* the length argument must be passed as Py_ssize_t, since that is what
+     * the "n" format unit reads from the argument list */
+#ifdef MS_WINDOWS
+    if (!(access_read = PyObject_GetAttrString(mmap_module, "ACCESS_READ")))
+        return -1;
+    map = PyObject_CallMethod(mmap_module, "mmap", "insO",
+            (int) fd, (Py_ssize_t) 0, NULL, access_read);
+    Py_DECREF(access_read);
+#else
+    map = PyObject_CallMethod(mmap_module, "mmap", "inii",
+            (int) fd, (Py_ssize_t) 0, MAP_SHARED, PROT_READ);
+#endif
+    if (!map)
+        return -1;
+
+    old = (PyObject *) self->mmap;
+    self->mmap = (mmap_object *) map;
+    Py_XDECREF(old);
+    self->filepos = 0;
+    self->offset = 0;
+    self->length = length;
+    return 0;
+}
+
+static PyObject *MMapIndexInput_clone(MMapIndexInput *self, PyObject *_) {
+    /* METH_NOARGS */
+    MMapIndexInput *clone;
+    PyObject *empty;
+
+    /* tp_new of a Python-level subclass needs a real (if empty) args tuple */
+    if (!(empty = PyTuple_New(0)))
+        return NULL;
+    clone = (MMapIndexInput *)self->ob_type->tp_new(self->ob_type, empty, NULL);
+    Py_DECREF(empty);
+    if (!clone)
+        return NULL;
+
+    if (replace_filepos_lock(&clone->filepos_lock) < 0) {
+        Py_DECREF(clone);
+        return NULL;
+    }
+    /* share the mapping rather than mapping the file a second time */
+    Py_XINCREF(self->mmap);
+    clone->mmap = self->mmap;
+    clone->filepos = self->filepos;
+    clone->offset = self->offset;
+    clone->length = self->length;
+
+    return (PyObject *)clone;
+}
+
+static void MMapIndexInput_dealloc(MMapIndexInput *self) {
+    Py_XDECREF(self->filepos_lock);
+    Py_XDECREF(self->mmap);
+    self->ob_type->tp_free((PyObject *)self);
+}
+
+static PyObject *MMapIndexInput_tell(MMapIndexInput *self, PyObject *_) {
+    /* METH_NOARGS */
+    return PyInt_FromSsize_t(self->filepos - self->offset);
+}
+
+static PyObject *MMapIndexInput_seek(MMapIndexInput *self, PyObject *args) {
+    Py_ssize_t pos;
+
+    if (!PyArg_ParseTuple(args, "n", &pos))
+        return NULL;
+    if (pos < 0) {
+        PyErr_SetString(PyExc_ValueError, "seek to negative position");
+        return NULL;
+    }
+    if (pos > self->length) {
+        PyErr_SetString(PyExc_ValueError, "seek beyond end of mmap");
+        return NULL;
+    }
+    self->filepos = self->offset + pos;
+    Py_RETURN_NONE;
+}
+
+/* Checks that n (>= 0) bytes can be read at the current file position without
+ * leaving the mapping. Returns 0 if so, otherwise -1 with a Python exception
+ * set. */
+static int MMapIndexInput__check(MMapIndexInput *self, Py_ssize_t n) {
+    size_t pos = (size_t) self->filepos;
+
+    /* mmap is NULL until __init__ has run; NOTE(review): data is presumably NULL once the mmap is closed */
+    if (!self->mmap || !self->mmap->data) {
+        PyErr_SetString(PyExc_ValueError, "mmap closed or invalid");
+        return -1;
+    }
+    if (self->filepos < 0 || pos > self->mmap->size ||
+            (size_t) n > self->mmap->size - pos) {
+        PyErr_SetString(PyExc_EOFError, "read past end of mmap");
+        return -1;
+    }
+    return 0;
+}
+
+/* read_byte_func which reads straight from the mapping. */
+static int MMapIndexInput__read_byte(PyObject *obj) {
+    MMapIndexInput *self = (MMapIndexInput *)obj;
+
+    if (MMapIndexInput__check(self, 1) < 0)
+        return -1;
+    return (unsigned char) self->mmap->data[self->filepos ++];
+}
+
+/* Returns a pointer to the next n (>= 0) bytes of the mapping and advances the
+ * file position past them, or returns NULL with a Python exception set. */
+static unsigned char *MMapIndexInput__read_bytes(MMapIndexInput *self,
+        Py_ssize_t n) {
+    unsigned char *retval;
+
+    if (MMapIndexInput__check(self, n) < 0)
+        return NULL;
+    retval = (unsigned char *)self->mmap->data + self->filepos;
+    self->filepos += n;
+    return retval;
+}
+
+static PyObject *MMapIndexInput_read_byte(MMapIndexInput *self, PyObject *_) {
+    /* METH_NOARGS */
+    int b;
+
+    if ((b = MMapIndexInput__read_byte((PyObject *)self)) < 0)
+        return NULL;
+    return PyInt_FromLong(b);
+}
+
+static PyObject *MMapIndexInput_read_bytes(MMapIndexInput *self, PyObject *args) {
+    Py_ssize_t n;
+    unsigned char *buff;
+
+    if (parse_count(args, &n) < 0)
+        return NULL;
+    if (!(buff = MMapIndexInput__read_bytes(self, n)))
+        return NULL;
+    /* not Py_BuildValue("s#"): without PY_SSIZE_T_CLEAN it reads an int length */
+    return PyString_FromStringAndSize((char *)buff, n);
+}
+
+static PyObject *MMapIndexInput_read_chars(MMapIndexInput *self, PyObject *args) {
+    Py_ssize_t n;
+
+    if (parse_count(args, &n) < 0)
+        return NULL;
+    return read_unicode(MMapIndexInput__read_byte, (PyObject *)self, n);
+}
+
+static PyObject *MMapIndexInput_skip_chars(MMapIndexInput *self, PyObject *args) {
+    Py_ssize_t n;
+
+    if (parse_count(args, &n) < 0)
+        return NULL;
+    if (decode_chars(MMapIndexInput__read_byte, (PyObject *)self, NULL, n) < 0)
+        return NULL;
+    Py_RETURN_NONE;
+}
+
+static PyObject *MMapIndexInput_read_int(MMapIndexInput *self, PyObject *_) {
+    /* METH_NOARGS */
+    unsigned char *in;
+
+    if (!(in = MMapIndexInput__read_bytes(self, 4)))
+        return NULL;
+    return PyInt_FromLong(decode_int32(in));
+}
+
+static PyObject *MMapIndexInput_read_long(MMapIndexInput *self, PyObject *_) {
+    /* METH_NOARGS */
+    unsigned char *in;
+
+    if (!(in = MMapIndexInput__read_bytes(self, 8)))
+        return NULL;
+    return PyLong_FromLongLong(decode_int64(in));
+}
+
+static PyObject *MMapIndexInput_read_vint(MMapIndexInput *self, PyObject *_) {
+    /* METH_NOARGS */
+    unsigned PY_LONG_LONG val;
+
+    if (decode_vint(MMapIndexInput__read_byte, (PyObject *)self, &val) < 0)
+        return NULL;
+    return PyLong_FromUnsignedLongLong(val);
+}
+
+static PyObject *MMapIndexInput_read_string(MMapIndexInput *self, PyObject *_) {
+    /* METH_NOARGS */
+    return read_lucene_string(MMapIndexInput__read_byte, (PyObject *)self);
+}
+
+static PyMethodDef MMapIndexInput_methods[] = {
+    {"clone", (PyCFunction) MMapIndexInput_clone, METH_NOARGS, NULL},
+    {"tell", (PyCFunction) MMapIndexInput_tell, METH_NOARGS, NULL},
+    {"seek", (PyCFunction) MMapIndexInput_seek, METH_VARARGS, NULL},
+    {"read_byte", (PyCFunction) MMapIndexInput_read_byte, METH_NOARGS, NULL},
+    {"read_bytes", (PyCFunction) MMapIndexInput_read_bytes, METH_VARARGS, NULL},
+    {"read_chars", (PyCFunction) MMapIndexInput_read_chars, METH_VARARGS, NULL},
+    {"skip_chars", (PyCFunction) MMapIndexInput_skip_chars, METH_VARARGS, NULL},
+    {"read_int", (PyCFunction) MMapIndexInput_read_int, METH_NOARGS, NULL},
+    {"read_long", (PyCFunction) MMapIndexInput_read_long, METH_NOARGS, NULL},
+    {"read_vint", (PyCFunction) MMapIndexInput_read_vint, METH_NOARGS, NULL},
+    {"read_string", (PyCFunction) MMapIndexInput_read_string, METH_NOARGS, NULL},
+    {NULL} /* sentinel */
+};
+
+static PyTypeObject MMapIndexInputType = {
+    PyObject_HEAD_INIT(NULL)
+    0, /* ob_size */
+    "lucille._store.MMapIndexInput",/* tp_name */
+    sizeof(MMapIndexInput), /* tp_basicsize */
+    0, /* tp_itemsize */
+    (destructor) MMapIndexInput_dealloc, /* tp_dealloc */
+    0, /* tp_print */
+    0, /* tp_getattr */
+    0, /* tp_setattr */
+    0, /* tp_compare */
+    0, /* tp_repr */
+    0, /* tp_as_number */
+    0, /* tp_as_sequence */
+    0, /* tp_as_mapping */
+    0, /* tp_hash */
+    0, /* tp_call */
+    0, /* tp_str */
+    0, /* tp_getattro */
+    0, /* tp_setattro */
+    0, /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+    MMapIndexInput_doc, /* tp_doc */
+    0, /* tp_traverse */
+    0, /* tp_clear */
+    0, /* tp_richcompare */
+    0, /* tp_weaklistoffset */
+    0, /* tp_iter */
+    0, /* tp_iternext */
+    MMapIndexInput_methods, /* tp_methods */
+    MMapIndexInput_members, /* tp_members */
+    MMapIndexInput_getset, /* tp_getset */
+    0, /* tp_base */
+    0, /* tp_dict */
+    0, /* tp_descr_get */
+    0, /* tp_descr_set */
+    0, /* tp_dictoffset */
+    (initproc) MMapIndexInput_init, /* tp_init */
+    0, /* tp_alloc */
+    MMapIndexInput_new, /* tp_new */
+};
+
+
+static PyMethodDef _store_methods[] = {
+    {NULL} /* sentinel */
+};
+
+PyMODINIT_FUNC init_store(void) {
+    /* declared and initialised before the first goto, because the error path
+     * below releases them */
+    PyObject *threading_module_name = NULL;
+    PyObject *mmap_module_name = NULL;
+    PyObject *m;
+
+    /* fetch ints */
+    if (!(zero = PyInt_FromLong(0)))
+        goto error;
+    if (!(four = PyInt_FromLong(4)))
+        goto error;
+    if (!(eight = PyInt_FromLong(8)))
+        goto error;
+
+    /* create strings */
+    if (!(str_read_byte = PyString_FromString("read_byte")))
+        goto error;
+    if (!(str_read_bytes = PyString_FromString("read_bytes")))
+        goto error;
+
+    /* import threading */
+    if (!(threading_module_name = PyString_FromString("threading")))
+        goto error;
+    if (!(threading_module = PyImport_Import(threading_module_name)))
+        goto error;
+    Py_CLEAR(threading_module_name);
+
+    /* import mmap */
+    if (!(mmap_module_name = PyString_FromString("mmap")))
+        goto error;
+    if (!(mmap_module = PyImport_Import(mmap_module_name)))
+        goto error;
+    Py_CLEAR(mmap_module_name);
+
+    /* initialise this module */
+    if (PyType_Ready(&BaseIndexInputType) < 0)
+        goto error;
+    if (PyType_Ready(&MMapIndexInputType) < 0)
+        goto error;
+
+    if (!(m = Py_InitModule3("lucille._store", _store_methods,
+            "Provides native implementations of some classes in lucille.store.")))
+        goto error;
+
+    Py_INCREF(&BaseIndexInputType);
+    PyModule_AddObject(m, "BaseIndexInput", (PyObject *)&BaseIndexInputType);
+    Py_INCREF(&MMapIndexInputType);
+    PyModule_AddObject(m, "MMapIndexInput", (PyObject *)&MMapIndexInputType);
+
+    return;
+
+error:
+    /* Py_CLEAR also resets the globals, so no dangling pointers are left */
+    Py_CLEAR(str_read_byte);
+    Py_CLEAR(str_read_bytes);
+    Py_CLEAR(eight);
+    Py_CLEAR(four);
+    Py_CLEAR(zero);
+    Py_CLEAR(mmap_module);
+    Py_XDECREF(mmap_module_name);
+    Py_CLEAR(threading_module);
+    Py_XDECREF(threading_module_name);
+    return;
+}

Property changes on: src/lucille/_storemodule.c
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/lucille/store.py
===================================================================
--- src/lucille/store.py	(.../trunk)	(revision 115)
+++ src/lucille/store.py	(.../branches/native_IndexInput)	(revision 115)
@@ -22,6 +22,8 @@
 import threading
 from StringIO import StringIO
 
+from lucille._store import BaseIndexInput, MMapIndexInput
+
 # TODO locking
 
 """ Lucene Directory protocol
@@ -74,31 +76,33 @@
     def open_input(self, name):
         fullname = os.path.join(self.path, name)
-        return IndexInput(fullname, os.path.getsize(fullname))
+        try:
+            return MMapIndexInput(open(fullname, 'rb'), os.path.getsize(fullname))
+        except (EnvironmentError, OSError, IOError):
+            return IndexInput(fullname, os.path.getsize(fullname))
 
 
-class IndexInput(object):
+class IndexInput(BaseIndexInput):
     """ filepos_lock should be acquired by callers around blocks which expect the
     file position to remain consistent. """
 
-    __slots__ = ('filename', 'file', 'length', 'filepos_lock')
+    __slots__ = ('filename', 'file', 'length', 'offset')
 
-    def __init__(self, filename, length):
+    def __init__(self, filename, length, **kwargs):
+        super(IndexInput, self).__init__()
         self.filename = filename
         self.length = length
         self.file = open(self.filename, 'rb')
-        self.filepos_lock = threading.RLock()
-        self.seek(0) # for subclasses
+        self.offset = 0
-
 
     def __len__(self):
         return self.length
-
 
     def seek(self, pos):
-        self.file.seek(pos)
+        if pos > self.length: raise EOFError()
+        self.file.seek(self.offset + pos)
 
     def tell(self):
-        return self.file.tell()
+        return self.file.tell() - self.offset
 
     def clone(self):
         """ Returns a new IndexInput which reads from the same file as this
@@ -106,70 +110,12 @@
         copied from this one. """
         p = self.tell()
         cl = self.__class__(self.filename, self.length)
+        cl.offset = self.offset
         cl.seek(p)
         return cl
 
     def read_byte(self):
-        # XXX could use mmap instead?
         return ord(self.file.read(1))
 
     def read_bytes(self, n):
         return self.file.read(n)
-
-    def read_int(self):
-        n = (self.read_byte() << 24 | self.read_byte() << 16 |
-                self.read_byte() << 8 | self.read_byte())
-        if n > 0xefffffff:
-            return n - 0xffffffff - 1
-        else:
-            return n
-
-    def read_long(self):
-        return (self.read_int() << 32 |
-                self.read_byte() << 24 | self.read_byte() << 16 |
-                self.read_byte() << 8 | self.read_byte())
-
-    def read_vint(self):
-        """ Lucene also defines #readVLong(), but read_vint can be used for
-        both. """
-        val = 0
-        shift = 0
-        while True:
-            b = self.read_byte()
-            val += (b & 127) << shift
-            shift += 7
-            if not (b & 128):
-                return val
-
-    def read_chars(self, n):
-        """ Reads n characters from the underlying file and returns them as a unicode object.
-
-        Lucene uses Java's "modified UTF-8": . """
-        buff = StringIO()
-        while n > 0:
-            b = self.read_byte()
-            if (b & 0x80) == 0:
-                buff.write(unichr(b & 0x7F))
-            elif (b & 0xE0) != 0xE0:
-                buff.write(unichr(((b & 0x1F) << 6) | (self.read_byte() & 0x3F)))
-            else:
-                buff.write(unichr(((b & 0x0F) << 12) | ((self.read_byte() & 0x3F) << 6) | (self.read_byte() & 0x3F)))
-            n -= 1
-        return buff.getvalue()
-
-    def skip_chars(self, n):
-        """ Like read_chars, but throws away the bytes. """
-        while n > 0:
-            b = self.read_byte()
-            if (b & 0x80) == 0:
-                pass
-            elif (b & 0xE0) != 0xE0:
-                self.read_byte()
-            else:
-                self.read_byte()
-                self.read_byte()
-            n -= 1
-
-    def read_string(self):
-        length = self.read_vint()
-        return self.read_chars(length)
Index: src/lucille/Makefile
===================================================================
--- src/lucille/Makefile	(.../trunk)	(revision 115)
+++ src/lucille/Makefile	(.../branches/native_IndexInput)	(revision 115)
@@ -1,11 +1,14 @@
 CFLAGS += -fno-strict-aliasing -fPIC $(shell python-config --includes)
 
 .PHONY: all
-all: _util.so
+all: _util.so _store.so
 
 _util.so: _utilmodule.o
 	$(CC) -shared -o $@ $<
 
+_store.so: _storemodule.o
+	$(CC) -shared -o $@ $<
+
 .PHONY: clean
 clean:
-	rm -f _util.so *.o
+	rm -f _util.so _store.so *.o
Index: src/lucille/index.py
===================================================================
--- src/lucille/index.py	(.../trunk)	(revision 115)
+++ src/lucille/index.py	(.../branches/native_IndexInput)	(revision 115)
@@ -198,27 +198,6 @@
     return find_segments_file(directory, _do_read_current_version)
 
 
-class OffsetIndexInput(IndexInput):
-
-    __slots__ = ('offset')
-
-    def __init__(self, filename, length, offset):
-        self.offset = offset
-        IndexInput.__init__(self, filename, length)
-
-    def seek(self, pos):
-        if pos > self.length: raise EOFError()
-        IndexInput.seek(self, pos + self.offset)
-
-    def tell(self):
-        return (IndexInput.tell(self) - self.offset)
-
-    def clone(self):
-        p = self.tell()
-        cl = OffsetIndexInput(self.filename, self.length, self.offset)
-        cl.seek(p)
-        return cl
-
 class CompoundFileReader(object):
     """ implements Directory protocol """
 
@@ -259,11 +238,11 @@
         return self.entries[name][1]
 
     def open_input(self, name):
-        # this only happens to work because Python's seek (apparently) doesn't
-        # use underlying OS seek, since file offsets are shared between
-        # dup()ped fds -- this might prove problematic!
-        return OffsetIndexInput(self.input.filename,
-                self.entries[name][1], self.entries[name][0])
+        input = self.input.clone()
+        input.offset = self.entries[name][0]
+        input.length = self.entries[name][1]
+        input.seek(0)
+        return input
 
 
 class SegmentInfo(object):