Add partitioned support to Sequence and Table sources #8

Draft: wants to merge 6 commits into master (changes shown from 5 commits)
3 changes: 0 additions & 3 deletions intake_solr/__init__.py
@@ -1,7 +1,4 @@
from ._version import get_versions
__version__ = get_versions()['version']
del get_versions

import intake # Import this first to avoid circular imports during discovery.
del intake
Author:
I got exceptions when running with dask distributed. Removing these lines fixed the issue.

Member:
With the move to entrypoints to declare the drivers, I hope this is no longer needed.

from .source import SOLRSequenceSource, SOLRTableSource
120 changes: 86 additions & 34 deletions intake_solr/source.py
@@ -1,3 +1,5 @@
import math

from intake.source import base
import pandas as pd
import pysolr
@@ -28,18 +30,25 @@ class SOLRSequenceSource(base.DataSource):
    zoocollection: bool or str
        If using Zookeeper to orchestrate SOLR, this is the name of the
        collection to connect to.
    partition_len: int or None
        The desired partition size. [default: 1024]
Author:
We add a new parameter that limits the number of rows that are returned per partition.

Author (@sodre, Jul 17, 2020):
I looked at intake-es and it seems they use npartitions as an input instead of partition_len.

❓ Let me know if you prefer that option.

Member:
I don't have a strong preference, but consistency might be good
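For context, a minimal sketch of how the new parameter might be passed when opening the source directly. The URL and core name are placeholders, and intake.open_solr assumes the driver is registered so that intake exposes that convenience function:

```python
import intake

# Hypothetical connection details; partition_len is the new argument.
source = intake.open_solr(
    query="*:*",
    base_url="http://localhost:8983",
    core="mycore",
    partition_len=1024,  # rows per partition; None keeps the single-partition behavior
)
print(source.discover()["npartitions"])
```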

"""
container = 'python'
name = 'solr'
version = __version__
partition_access = False
partition_access = True

def __init__(self, query, base_url, core, qargs=None, metadata=None,
auth=None, cert=None, zoocollection=False):
auth=None, cert=None, zoocollection=False,
partition_len=1024):
self.query = query
self.qargs = qargs or {}
self.metadata = metadata or {}
self._schema = None
self.partition_len = partition_len

if partition_len and partition_len <= 0:
raise ValueError(f"partition_len must be None or positive, got {partition_len}")
Author:
When partition_len is None, we get the old behavior of not setting the row count.

We should verify that the old behavior was actually working. On my system, if we don't set the number of rows to return, then we only get back the first ten records of the dataset.

Member:
A test would be good. Does setting partition_len -> +inf (very large number) work for one-partition output?
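A rough sketch of such a test, assuming a fixture that yields the base URL of a running test core; the fixture and core names are hypothetical, not the ones in this repository's test suite:

```python
from intake_solr.source import SOLRSequenceSource


def test_huge_partition_len_gives_single_partition(solr_url):  # hypothetical fixture
    source = SOLRSequenceSource(
        query="*:*",
        base_url=solr_url,    # e.g. "http://localhost:9200"
        core="testcore",      # placeholder core name
        partition_len=10**9,  # far larger than any test dataset
    )
    info = source.discover()
    assert info["npartitions"] == 1
    # All records should come back in that single partition.
    assert len(list(source.read())) == info["shape"][0]
```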

        if auth == 'kerberos':
            from requests_kerberos import HTTPKerberosAuth, OPTIONAL
            auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL,
@@ -64,24 +73,57 @@ def __init__(self, query, base_url, core, qargs=None, metadata=None,
        super(SOLRSequenceSource, self).__init__(metadata=metadata)

    def _get_schema(self):
        return base.Schema(datashape=None,
                           dtype=None,
                           shape=None,
                           npartitions=1,
                           extra_metadata={})
        """Do a 0 row query and get the number of hits from the response"""
        qargs = self.qargs.copy()
        qargs["rows"] = 0
        start = qargs.get("start", 0)
Author:
The user may want to start the query at a different position, so we take that into account.

Author:
📜 If we add support for Cursors, then we can't use the start option, according to SOLR documentation.

Member:
I don't know what people would normally use.
Does offsetting with start cause the server to scan the whole table, or is solr smart here?
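For reference, deep start offsets are known to get more expensive in SOLR as the offset grows, which is what the cursor API addresses. A rough sketch of cursor-based paging with pysolr, using a placeholder URL, sort field, and page size; cursors require a stable sort on the unique key and cannot be combined with start:

```python
import pysolr

solr = pysolr.Solr("http://localhost:8983/solr/mycore")  # placeholder URL

cursor = "*"
docs = []
while True:
    # Cursor paging needs a deterministic sort that includes the uniqueKey field.
    results = solr.search("*:*", **{"cursorMark": cursor, "sort": "id asc", "rows": 1024})
    docs.extend(results.docs)
    if results.nextCursorMark == cursor:  # the same cursor back means no more pages
        break
    cursor = results.nextCursorMark
```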

        results = self.solr.search(self.query, **qargs)

        if self.partition_len is None:
            npartitions = 1
        else:
            npartitions = math.ceil((results.hits - start) / self.partition_len)

        return base.Schema(
            datashape=None,
            dtype=None,
            shape=(results.hits - start,),
Comment on lines +88 to +90
Author (@sodre, Jul 17, 2020):
❓ What is the difference between datashape and shape?

Member:
datashape isn't used; it was meant for forward compatibility with complex types (struct, nested list)

            npartitions=npartitions,
            extra_metadata={},
        )

    def _do_query(self):
    def _do_query(self, index):
        qargs = self.qargs.copy()
        if self.partition_len is not None:
            qargs["start"] = qargs.get("start", 0) + index * self.partition_len
            qargs["rows"] = self.partition_len
        return self.solr.search(self.query, **qargs)
Author (@sodre, Jul 17, 2020):
Let's return the raw results of the query at this point. There are other valuable fields, like facets, that can be used in subclasses.

Member:
ok.
Are facets another useful way to partition? Are there shards too?
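As a small illustration of the extra fields on the raw result, this is roughly how facet counts could be read from a pysolr response; the facet field name and URL are placeholders:

```python
import pysolr

solr = pysolr.Solr("http://localhost:8983/solr/mycore")  # placeholder URL
results = solr.search("*:*", **{
    "rows": 0,
    "facet": "true",
    "facet.field": "category",  # hypothetical field to facet on
})
# pysolr surfaces the facet section of the response on the Results object.
category_counts = results.facets.get("facet_fields", {}).get("category", [])
```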


    def _get_partition(self, index):
        """Downloads all data in query response"""
        solr_rv = self._do_query(index)
        out = []
        data = self.solr.search(self.query, **self.qargs).docs
        for d in data:
        for d in solr_rv.docs:
            out.append({k: (v[0] if isinstance(v, (tuple, list)) else v)
                        for k, v in d.items()})
        return out

    def _get_partition(self, _):
        """Downloads all data
        """
        return self._do_query()
    def _close(self):
        pass

    def read(self):
        self._load_metadata()
        from itertools import chain
        return chain(*(self._get_partition(index) for index in range(self.npartitions)))

    def to_dask(self):
        from dask import delayed
        import dask.bag

        self._load_metadata()
        return dask.bag.from_delayed(
            [delayed(self.read_partition)(i) for i in range(self.npartitions)]
        )
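A brief usage sketch of the partitioned sequence source; connection details are placeholders and intake.open_solr again assumes the registered driver name:

```python
import intake

source = intake.open_solr("*:*", "http://localhost:8983", "mycore", partition_len=1024)
bag = source.to_dask()  # one dask.bag partition per SOLR partition
print(bag.npartitions)
print(bag.take(3))      # pull a few records without reading everything
```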


class SOLRTableSource(SOLRSequenceSource):
@@ -108,32 +150,42 @@ class SOLRTableSource(SOLRSequenceSource):
    zoocollection: bool or str
        If using Zookeeper to orchestrate SOLR, this is the name of the
        collection to connect to.
    partition_len: int or None
        The desired partition size. [default: 1024]
    """

    name = 'solrtab'
    container = 'dataframe'
    partition_access = True

    def _get_schema(self, retry=2):
        """Get schema from first 10 hits or cached dataframe"""
        if not hasattr(self, '_dataframe'):
            self._get_partition(0)
        dtype = {k: str(v)
                 for k, v in self._dataframe.dtypes.to_dict().items()}
        return base.Schema(datashape=None,
                           dtype=dtype,
                           shape=self._dataframe.shape,
                           npartitions=1,
                           extra_metadata={})

    def _get_partition(self, _):
        """Downloads all data
        schema = super()._get_schema()
Author:
We get the Schema from SOLRSequenceSource. This contains the number of partitions and the total number of records, but not the dtype.

Member:
Is it not worth grabbing the result head to figure this out (on request)?


        df = self._get_partition(0)
Author (@sodre, Jul 17, 2020):
This loads the first partition into a dataframe and uses it to discover the returned schema.
Note that the schema might be different from the overall SOLR core schema because the user can select a subset of fields using the fl qarg.
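For illustration, a sketch of how an fl entry in qargs narrows the discovered dtype under this change; field names and connection details are placeholders:

```python
from intake_solr.source import SOLRTableSource

source = SOLRTableSource(
    query="*:*",
    base_url="http://localhost:8983",  # placeholder
    core="mycore",                     # placeholder
    qargs={"fl": ["id", "price"]},     # only these fields are returned and typed
)
print(source.discover()["dtype"])      # e.g. {'id': 'object', 'price': 'float64'}
```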

schema["dtype"] = {k: str(v)
for k, v in df.dtypes.to_dict().items()}
schema["shape"] = (schema["shape"][0], *df.shape[1:])
return schema

def _get_partition(self, index):
"""Downloads all data in the partition
"""
if not hasattr(self, '_dataframe'):
df = pd.DataFrame(self._do_query())
self._dataframe = df
self._schema = None
self.discover()
return self._dataframe
seq = super()._get_partition(index)
# Columns are sorted unless the user defines the field list (fl)
columns = self.qargs["fl"] if "fl" in self.qargs else sorted(seq[0].keys())
return pd.DataFrame(seq, columns=columns)

def read(self):
self._load_metadata()
return pd.concat(self._get_partition(index) for index in range(self.npartitions))

def to_dask(self):
from dask import delayed
import dask.dataframe

self._load_metadata()
return dask.dataframe.from_delayed(
[delayed(self.read_partition)(i) for i in range(self.npartitions)]
Member:
There is also bag.to_dataframe, which may be less code and would reuse the sequence partitions.

        )

    def _close(self):
        self._dataframe = None
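For comparison, a rough sketch of the bag.to_dataframe route mentioned above, reusing the sequence source's partitions instead of a separate from_delayed path; it assumes every partition yields dicts with the same keys so dask can infer the columns:

```python
def table_to_dask(seq_source):
    """Hypothetical alternative: build the dask dataframe from the sequence bag."""
    bag = seq_source.to_dask()  # dask.bag of record dicts, one partition per SOLR page
    return bag.to_dataframe()   # dask infers columns and dtypes from the first records
```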
2 changes: 1 addition & 1 deletion tests/test_intake_solr.py
@@ -9,7 +9,7 @@
from .util import start_solr, stop_docker, TEST_CORE

CONNECT = {'host': 'localhost', 'port': 9200}
TEST_DATA_DIR = 'tests'
TEST_DATA_DIR = os.path.abspath(os.path.dirname(__file__))
Author:
I was having issues running the tests from within PyCharm; this fixed the problem for me.

Member:
seems good practice - should not require a particular CWD

TEST_DATA = 'sample1.csv'
df = pd.read_csv(os.path.join(TEST_DATA_DIR, TEST_DATA))
