From 3563a03d0996b91aa77d084e43d35bca12150d08 Mon Sep 17 00:00:00 2001
From: Julian Smith <julian.smith@artifex.com>
Date: Sat, 18 May 2024 23:23:39 +0100
Subject: [PATCH 1/6] pipcl.py: minor changes to diagnostics.

Also removed trailing white space.
---
 pipcl.py | 61 +++++++++++++++++++++++++++++---------------------------
 1 file changed, 32 insertions(+), 29 deletions(-)
diff --git a/pipcl.py b/pipcl.py
index 0434e8af9..3680ca53b 100644
--- a/pipcl.py
+++ b/pipcl.py
@@ -237,21 +237,21 @@ class Package:
         >>> assert len(so) == 1
         >>> so = so[0]
         >>> assert os.path.getmtime(so) > t0
-    
+
     Check `entry_points` causes creation of command `foo_cli` when we install
     from our wheel using pip. [As of 2024-02-24 using pipcl's CLI interface
     directly with `setup.py install` does not support entry points.]
-    
+
         >>> print('Creating venv.', file=sys.stderr)
         >>> _ = subprocess.run(
         ...         f'cd pipcl_test && {sys.executable} -m venv pylocal',
         ...         shell=1, check=1)
-        
+
         >>> print('Installing from wheel into venv using pip.', file=sys.stderr)
         >>> _ = subprocess.run(
         ...         f'. pipcl_test/pylocal/bin/activate && pip install pipcl_test/dist/*.whl',
         ...         shell=1, check=1)
-        
+
         >>> print('Running foo_cli.', file=sys.stderr)
         >>> _ = subprocess.run(
         ...         f'. pipcl_test/pylocal/bin/activate && foo_cli',
@@ -299,7 +299,7 @@ def __init__(self,
             requires_external = None,
             project_url = None,
             provides_extra = None,
-            
+
             entry_points = None,
 
             root = None,
@@ -374,21 +374,21 @@ def __init__(self,
             entry_points:
                 String or dict specifying *.dist-info/entry_points.txt, for
                 example:
-                
+
                     ```
                     [console_scripts]
                     foo_cli = foo.__main__:main
                     ```
-                
+
                 or:
-                
+
                     { 'console_scripts': [
                         'foo_cli = foo.__main__:main',
                         ],
                     }
-                
+
                 See: https://packaging.python.org/en/latest/specifications/entry-points/
-            
+
             root:
                 Root of package, defaults to current directory.
 
@@ -684,7 +684,7 @@ def add_str(content, to_):
             # Add <name>-<version>.dist-info/COPYING.
             if self.license:
                 add_str(self.license, f'{dist_info_dir}/COPYING')
-            
+
             # Add <name>-<version>.dist-info/entry_points.txt.
             entry_points_text = self._entry_points_text()
             if entry_points_text:
@@ -735,15 +735,15 @@ def build_sdist(self,
         os.makedirs(sdist_directory, exist_ok=True)
         tarpath = f'{sdist_directory}/{prefix}.tar.gz'
         log2(f'Creating sdist: {tarpath}')
-        
+
         with tarfile.open(tarpath, 'w:gz') as tar:
-            
+
             names_in_tar = list()
             def check_name(name):
                 if name in names_in_tar:
                     raise Exception(f'Name specified twice: {name}')
                 names_in_tar.append(name)
-            
+
             def add(from_, name):
                 check_name(name)
                 if isinstance(from_, str):
@@ -757,7 +757,7 @@ def add(from_, name):
                     tar.addfile(ti, io.BytesIO(from_))
                 else:
                     assert 0
-        
+
             def add_string(text, name):
                 textb = text.encode('utf8')
                 return add(textb, name)
@@ -776,7 +776,7 @@ def add_string(text, name):
                     if to_rel == 'pyproject.toml':
                         found_pyproject_toml = True
                     add(from_, to_rel)
-            
+
             if not found_pyproject_toml:
                 log0(f'Warning: no pyproject.toml specified.')
 
@@ -884,10 +884,10 @@ def add_str(content, to_abs, to_rel):
             add_file( from_, to_abs2, to_rel)
 
         add_str( self._metainfo(), f'{root2}/{dist_info_dir}/METADATA', f'{dist_info_dir}/METADATA')
-        
+
         if self.license:
             add_str( self.license, f'{root2}/{dist_info_dir}/COPYING', f'{dist_info_dir}/COPYING')
-        
+
         entry_points_text = self._entry_points_text()
         if entry_points_text:
             add_str(
@@ -1284,7 +1284,7 @@ def _fromto(self, p):
         if isinstance(p, str):
             p = p, p
         assert isinstance(p, tuple) and len(p) == 2
-        
+
         from_, to_ = p
         assert isinstance(from_, (str, bytes))
         assert isinstance(to_, str)
@@ -1798,7 +1798,7 @@ def git_items( directory, submodules=False):
     return ret
 
 
-def run( command, capture=False, check=1):
+def run( command, capture=False, check=1, verbose=1):
     '''
     Runs a command using `subprocess.run()`.
 
@@ -1818,6 +1818,8 @@ def run( command, capture=False, check=1):
         check:
             If true we raise an exception on error; otherwise we include the
             command's returncode in our return value.
+        verbose:
+            If true we show the command.
     Returns:
         check capture   Return
         --------------------------
@@ -1828,7 +1830,8 @@ def run( command, capture=False, check=1):
     '''
     lines = _command_lines( command)
     nl = '\n'
-    log2( f'Running: {nl.join(lines)}')
+    if verbose:
+        log1( f'Running: {nl.join(lines)}')
     sep = ' ' if windows() else '\\\n'
     command2 = sep.join( lines)
     cp = subprocess.run(
@@ -1924,18 +1927,18 @@ def __init__(self):
                                 stderr=subprocess.DEVNULL,
                                 check=0,
                                 ).returncode
-                        log1(f'{e=} from {pc!r}.')
+                        log2(f'{e=} from {pc!r}.')
                         if e == 0:
                             python_config = pc
                     assert python_config, f'Cannot find python-config'
                 else:
                     python_config = f'{python_exe}-config'
-            log1(f'Using {python_config=}.')
+            log2(f'Using {python_config=}.')
             try:
-                self.includes = run( f'{python_config} --includes', capture=1).strip()
+                self.includes = run( f'{python_config} --includes', capture=1, verbose=0).strip()
             except Exception as e:
                 raise Exception('We require python development tools to be installed.') from e
-            self.ldflags = run( f'{python_config} --ldflags', capture=1).strip()
+            self.ldflags = run( f'{python_config} --ldflags', capture=1, verbose=0).strip()
             if linux():
                 # It seems that with python-3.10 on Linux, we can get an
                 # incorrect -lcrypt flag that on some systems (e.g. WSL)
@@ -2100,7 +2103,7 @@ def run_if( command, out, *prerequisites):
     if not doit:
         out_mtime = _fs_mtime( out)
         if out_mtime == 0:
-            doit = 'File does not exist: {out!e}'
+            doit = f'File does not exist: {out!r}'
 
     cmd_path = f'{out}.cmd'
     if os.path.isfile( cmd_path):
@@ -2155,7 +2158,7 @@ def _make_prerequisites(p):
             os.remove( cmd_path)
         except Exception:
             pass
-        log2( f'Running command because: {doit}')
+        log1( f'Running command because: {doit}')
 
         run( command)
 
@@ -2164,7 +2167,7 @@ def _make_prerequisites(p):
             f.write( command)
         return True
     else:
-        log2( f'Not running command because up to date: {out!r}')
+        log1( f'Not running command because up to date: {out!r}')
 
     if 0:
         log2( f'out_mtime={time.ctime(out_mtime)} pre_mtime={time.ctime(pre_mtime)}.'
@@ -2361,7 +2364,7 @@ def add_content(self, content, to_, verbose=True):
             log2(f'Adding {to_}')
 
     def add_file(self, from_, to_):
-        log2(f'Adding file: {os.path.relpath(from_)} => {to_}')
+        log1(f'Adding file: {os.path.relpath(from_)} => {to_}')
         with open(from_, 'rb') as f:
             content = f.read()
         self.add_content(content, to_, verbose=False)

From 7c2a475593f2dd4023b2d0d1df389a272da4425d Mon Sep 17 00:00:00 2001
From: Julian Smith <julian.smith@artifex.com>
Date: Sat, 18 May 2024 23:23:52 +0100
Subject: [PATCH 2/6] setup.py: reduced diagnostics.

Also put generated files in src/build/, so that they are not removed by `git
clean` unless `-d` is specified.
---
 setup.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/setup.py b/setup.py
index e2b426015..c9e39e9aa 100755
--- a/setup.py
+++ b/setup.py
@@ -571,13 +571,6 @@ def build():
             build_type,
             )
     
-    for d in (
-            mupdf_build_dir,
-            f'{g_root}/src',
-            ):
-        if d:
-            run(f'ls -l {os.path.relpath(d)}', check=0)
-    
     # Generate lists of (from, to) items to return to pipcl. We put MuPDF
     # shared libraries in a separate list so that we can build specific wheels
     # as determined by g_flavour.
@@ -622,8 +615,8 @@ def add( ret, from_, to_):
         add( ret_p, f'{g_root}/src/pymupdf.py', to_dir)
         add( ret_p, f'{g_root}/src/table.py', to_dir)
         add( ret_p, f'{g_root}/src/utils.py', to_dir)
-        add( ret_p, f'{g_root}/src/extra.py', to_dir)
-        add( ret_p, f'{g_root}/src/{path_so_leaf_b}', to_dir)
+        add( ret_p, f'{g_root}/src/build/extra.py', to_dir)
+        add( ret_p, f'{g_root}/src/build/{path_so_leaf_b}', to_dir)
         
         if mupdf_local:
             add( ret_p, f'{mupdf_build_dir}/mupdf.py', to_dir)
@@ -659,8 +652,6 @@ def add( ret, from_, to_):
     else:
         add( ret, f'{g_root}/README.md', '$dist-info/README.md')
     
-    for f, t in ret:
-        log( f'build(): {f} => {t}')
     return ret
 
 
@@ -963,7 +954,7 @@ def _build_extension_rebased( mupdf_local, mupdf_build_dir, build_type):
     path_so_leaf_b = pipcl.build_extension(
             name = 'extra',
             path_i = f'{g_root}/src/extra.i',
-            outdir = f'{g_root}/src',
+            outdir = f'{g_root}/src/build',
             includes = includes,
             defines = defines,
             libpaths = libpaths,

From 1296fba0984fd372e1c99b4b9d76d0c31c229cf8 Mon Sep 17 00:00:00 2001
From: Julian Smith <julian.smith@artifex.com>
Date: Mon, 20 May 2024 18:20:07 +0100
Subject: [PATCH 3/6] src/__init__.py: patch one extra line to use _format_g().

This patches up one place that was missed in an earlier commit.
---
 src/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/__init__.py b/src/__init__.py
index 3f307a7db..cc1e65aa1 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -8994,7 +8994,8 @@ def remove_rotation(self):
 
         # prefix with derotation matrix
         mat = mat0 * self.derotation_matrix
-        cmd = b"%g %g %g %g %g %g cm " % tuple(mat)
+        cmd = _format_g(tuple(mat)) + ' cm '
+        cmd = cmd.encode('utf8')
         _ = TOOLS._insert_contents(self, cmd, False)  # prepend to page contents
 
         # swap x- and y-coordinates

From 35332d01e9820a39418bf8d2fa920aa14800fe44 Mon Sep 17 00:00:00 2001
From: Julian Smith <julian.smith@artifex.com>
Date: Mon, 20 May 2024 18:22:18 +0100
Subject: [PATCH 4/6] src/__init__.py tests/conftest.py: check no calls to
 log() when running tests.

Also changed exception_info() to call log() instead of writing directly to
_g_out_log, so that exception backtraces are also checked.
---
 src/__init__.py   | 9 +++++++--
 tests/conftest.py | 4 ++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/__init__.py b/src/__init__.py
index cc1e65aa1..78d38dcd7 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -61,6 +61,8 @@ def _set_stream(name, default):
 _g_out_log = _set_stream('PYMUPDF_LOG', sys.stdout)
 _g_out_message = _set_stream('PYMUPDF_MESSAGE', sys.stdout)
 
+# None, or a list (set by tests/conftest.py) to which log() appends each message.
+_g_log_items = None
 
 def log( text='', caller=1):
     '''
@@ -70,7 +72,10 @@ def log( text='', caller=1):
     filename    = os.path.relpath(frame_record.filename)
     line        = frame_record.lineno
     function    = frame_record.function
-    print( f'{filename}:{line}:{function}: {text}', file=_g_out_log)
+    text = f'{filename}:{line}:{function}: {text}'
+    if _g_log_items is not None:
+        _g_log_items.append(text)
+    print(text, file=_g_out_log)
     _g_out_log.flush()
 
 
@@ -85,7 +90,7 @@ def message(text=''):
 def exception_info():
     import traceback
     log(f'exception_info:')
-    traceback.print_exc(file=_g_out_log)
+    log(traceback.format_exc())
 
 
 # PDF names must not contain these characters:
diff --git a/tests/conftest.py b/tests/conftest.py
index 3aba1c948..76ef90ef7 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -12,6 +12,8 @@ def wrap(*args, **kwargs):
     assert not wt, f'{wt=}'
     assert not pymupdf.TOOLS.set_small_glyph_heights()
     
+    pymupdf._g_log_items = list()
+    
     # Run the test.
     rep = yield
     
@@ -23,3 +25,5 @@ def wrap(*args, **kwargs):
         assert not wt, f'Warnings text not empty: {wt=}'
     
     assert not pymupdf.TOOLS.set_small_glyph_heights()
+    
+    assert not pymupdf._g_log_items, f'log() was called; {len(pymupdf._g_log_items)=}.'

From 798991977821e2d3a8c6ee880b51d58cd63edc2c Mon Sep 17 00:00:00 2001
From: Julian Smith <julian.smith@artifex.com>
Date: Mon, 20 May 2024 17:38:05 +0100
Subject: [PATCH 5/6] src/utils.py: disable various expected calls to
 pymupdf.exception_info().

Addresses #3479 and #3488.
---
 src/utils.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index b4fdd43ea..f26ffe5cd 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -933,7 +933,7 @@ def getLinkDict(ln, document=None) -> dict:
         nl["from"] = ln.rect
     except Exception:
         # This seems to happen quite often in PyMuPDF/tests.
-        if g_exceptions_verbose:    pymupdf.exception_info()
+        if g_exceptions_verbose >= 2:   pymupdf.exception_info()
         pass
     pnt = pymupdf.Point(0, 0)
     if dest.flags & pymupdf.LINK_FLAG_L_VALID:
@@ -1444,46 +1444,46 @@ def set_toc(
             txt += ol["dest"]
         except Exception:
             # Verbose in PyMuPDF/tests.
-            if g_exceptions_verbose:    pymupdf.exception_info()
+            if g_exceptions_verbose >= 2:   pymupdf.exception_info()
             pass
         try:
             if ol["first"] > -1:
                 txt += "/First %i 0 R" % xref[ol["first"]]
         except Exception:
-            if g_exceptions_verbose:    pymupdf.exception_info()
+            if g_exceptions_verbose >= 2:   pymupdf.exception_info()
             pass
         try:
             if ol["last"] > -1:
                 txt += "/Last %i 0 R" % xref[ol["last"]]
         except Exception:
-            if g_exceptions_verbose:    pymupdf.exception_info()
+            if g_exceptions_verbose >= 2:   pymupdf.exception_info()
             pass
         try:
             if ol["next"] > -1:
                 txt += "/Next %i 0 R" % xref[ol["next"]]
         except Exception:
             # Verbose in PyMuPDF/tests.
-            if g_exceptions_verbose:    pymupdf.exception_info()
+            if g_exceptions_verbose >= 2:   pymupdf.exception_info()
             pass
         try:
             if ol["parent"] > -1:
                 txt += "/Parent %i 0 R" % xref[ol["parent"]]
         except Exception:
             # Verbose in PyMuPDF/tests.
-            if g_exceptions_verbose:    pymupdf.exception_info()
+            if g_exceptions_verbose >= 2:   pymupdf.exception_info()
             pass
         try:
             if ol["prev"] > -1:
                 txt += "/Prev %i 0 R" % xref[ol["prev"]]
         except Exception:
             # Verbose in PyMuPDF/tests.
-            if g_exceptions_verbose:    pymupdf.exception_info()
+            if g_exceptions_verbose >= 2:   pymupdf.exception_info()
             pass
         try:
             txt += "/Title" + ol["title"]
         except Exception:
             # Verbose in PyMuPDF/tests.
-            if g_exceptions_verbose:    pymupdf.exception_info()
+            if g_exceptions_verbose >= 2:   pymupdf.exception_info()
             pass
 
         if ol.get("color") and len(ol["color"]) == 3:
@@ -4686,8 +4686,7 @@ def output_justify(start, line):
         try:
             line, tl = new_lines.pop(0)
         except IndexError:
-            # Verbose in PyMuPDF/tests.
-            if g_exceptions_verbose:    pymupdf.exception_info()
+            if g_exceptions_verbose >= 2:   pymupdf.exception_info()
             break
 
         if right_to_left:  # Arabic, Hebrew

From 081ca67044dd240b04ff911a42bca95f9d34ba46 Mon Sep 17 00:00:00 2001
From: Julian Smith <julian.smith@artifex.com>
Date: Thu, 16 May 2024 13:17:05 +0100
Subject: [PATCH 6/6] Add pymupdf.get_text() - optionally concurrent call of
 Page.get_text() on some/all pages.

setup.py
    Add new _get_text.py to wheels/installs.
src/__init__.py
    New top-level get_text() fn, calls _get_text.get(text).
src/_get_text.py
    New, contains implementation of get_text().
tests/test_pylint.py
    Avoid pylint failure by disabling `R0801: Similar lines in 2 files`.
tests/test_textextract.py
    Test get_text() and show timings.

Timings for macOS-arm64 and PDF spec:
    method='mp': 3.3x.
    method='fork': 3.6x.
---
 setup.py                  |  11 ++-
 src/__init__.py           |  73 +++++++++++++++++++
 src/_get_text.py          | 144 ++++++++++++++++++++++++++++++++++++++
 tests/test_pylint.py      |   5 +-
 tests/test_textextract.py |  37 ++++++++++
 5 files changed, 266 insertions(+), 4 deletions(-)
 create mode 100644 src/_get_text.py

diff --git a/setup.py b/setup.py
index c9e39e9aa..9a6116753 100755
--- a/setup.py
+++ b/setup.py
@@ -606,21 +606,26 @@ def add( ret, from_, to_):
 
     if path_so_leaf_b:
         # Add rebased implementation files.
-        add( ret_p, f'{g_root}/src/fitz___init__.py', 'fitz/__init__.py')   # For `fitz` module alias.
-        add( ret_p, f'{g_root}/src/fitz_table.py', 'fitz/table.py')         # For `fitz` module alias.
-        add( ret_p, f'{g_root}/src/fitz_utils.py', 'fitz/utils.py')         # For `fitz` module alias.
         to_dir = 'pymupdf/'
         add( ret_p, f'{g_root}/src/__init__.py', to_dir)
         add( ret_p, f'{g_root}/src/__main__.py', to_dir)
         add( ret_p, f'{g_root}/src/pymupdf.py', to_dir)
         add( ret_p, f'{g_root}/src/table.py', to_dir)
         add( ret_p, f'{g_root}/src/utils.py', to_dir)
+        add( ret_p, f'{g_root}/src/_get_text.py', to_dir)
         add( ret_p, f'{g_root}/src/build/extra.py', to_dir)
         add( ret_p, f'{g_root}/src/build/{path_so_leaf_b}', to_dir)
         
+        # Add support for `fitz` backwards compatibility.
+        add( ret_p, f'{g_root}/src/fitz___init__.py', 'fitz/__init__.py')
+        add( ret_p, f'{g_root}/src/fitz_table.py', 'fitz/table.py')
+        add( ret_p, f'{g_root}/src/fitz_utils.py', 'fitz/utils.py')
+        
         if mupdf_local:
+            # Add MuPDF Python API.
             add( ret_p, f'{mupdf_build_dir}/mupdf.py', to_dir)
             
+            # Add MuPDF shared libraries.
             if windows:
                 wp = pipcl.wdev.WindowsPython()
                 add( ret_p, f'{mupdf_build_dir}/_mupdf.pyd', to_dir)
diff --git a/src/__init__.py b/src/__init__.py
index 78d38dcd7..e5381f2f0 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -20942,6 +20942,79 @@ def vdist(dir, a, b):
     return mupdf.fz_abs(dx * dir.y + dy * dir.x)
 
 
+def get_text(
+        path,
+        *,
+        pages=None,
+        method='single',
+        concurrency=None,
+        
+        option='text',
+        clip=None,
+        flags=None,
+        textpage=None,
+        sort=False,
+        delimiters=None,
+        ):
+    '''
+    Returns list of results from `Page.get_text()`, optionally using
+    concurrency for speed.
+    
+    Args:
+        path:
+            Path of document.
+        pages:
+            List of page numbers to process, or None to include all pages.
+        method:
+            'single'
+                Do not use concurrency.
+            'mp'
+                Operate concurrently using Python's `multiprocessing` module.
+            'fork'
+                 Operate concurrently using custom implementation with
+                 `os.fork`. Does not work on Windows.
+        concurrency:
+            Number of worker processes to use when operating concurrently. If
+            None, we use the number of available CPUs.
+        option clip flags textpage sort delimiters:
+            Passed to internal calls to `Page.get_text()`.
+    '''
+    args_dict = dict(
+            option=option,
+            clip=clip,
+            flags=flags,
+            textpage=textpage,
+            sort=sort,
+            delimiters=delimiters,
+            )
+    
+    if method == 'single':
+        ret = list()
+        document = Document(path)
+        for page in document:
+            text = page.get_text(**args_dict)
+            ret.append(text)
+        return ret
+    
+    # Use concurrency.
+    #
+    from . import _get_text
+    
+    if pages is None:
+        with Document(path) as document:
+            num_pages = len(document)
+            pages = list(range(num_pages))
+    
+    if method == 'mp':
+        return _get_text._get_text_mp(path, pages, concurrency, args_dict)
+    
+    elif method == 'fork':
+        return _get_text._get_text_fork(path, pages, concurrency, args_dict)
+        
+    else:
+        assert 0, f'Unrecognised {method=}.'
+
+
 class TOOLS:
     '''
     We use @staticmethod to avoid the need to create an instance of this class.
diff --git a/src/_get_text.py b/src/_get_text.py
new file mode 100644
index 000000000..c98d68fde
--- /dev/null
+++ b/src/_get_text.py
@@ -0,0 +1,144 @@
+import multiprocessing
+import os
+
+import pymupdf
+
+
+# Support for `method='mp'` (Python's `multiprocessing` module).
+#
+# By default each `multiprocessing` worker process would create a `Document`
+# each time it was asked to process a page. We avoid this by using a global
+# `Document` instance. Haven't found a more elegant way - putting state
+# into a class on the server before creating workers doesn't work because
+# multiprocessing appears to always send the server's state in each iteration.
+#
+# It's not too bad because this global state is only ever used by workers, so
+# doesn't actually limit things in general.
+#
+_mp_worker_path = None
+_mp_worker_document = None
+_mp_worker_args_dict = None
+
+def _mp_worker_init(path, args_dict):
+    global _mp_worker_path
+    global _mp_worker_args_dict
+    assert _mp_worker_path is None
+    assert _mp_worker_args_dict is None
+    _mp_worker_path = path
+    _mp_worker_args_dict = args_dict
+
+def _mp_worker(page_number):
+    global _mp_worker_document
+    if not _mp_worker_document:
+        _mp_worker_document = pymupdf.Document(_mp_worker_path)
+    page = _mp_worker_document[page_number]
+    ret = page.get_text(**_mp_worker_args_dict)
+    return ret
+
+
+def _get_text_mp(
+        path,
+        pages,
+        concurrency,
+        args_dict,
+        ):
+    with multiprocessing.Pool(
+            concurrency,
+            _mp_worker_init,
+            (path, args_dict),
+            ) as pool:
+        result = pool.map_async(_mp_worker, pages)
+        return result.get()
+
+
+def _get_text_fork(
+        path,
+        pages,
+        concurrency,
+        args_dict,
+        ):
+    '''
+    Implementation for `method='fork'`.
+    '''
+    verbose = 0
+    if concurrency is None:
+        concurrency = multiprocessing.cpu_count()
+    # We send page numbers to queue_pc and collect (page_num, text) from
+    # queue_cp. Workers each repeatedly take the next available page number
+    # from queue_pc, extract the text and put it onto queue_cp.
+    #
+    # This is better than pre-allocating a subset of pages to each worker
+    # because it ensures there will never be idle workers until we are near
+    # the end with fewer pages left than workers.
+    #
+    queue_pc = multiprocessing.Queue()
+    queue_cp = multiprocessing.Queue()
+    
+    def childfn():
+        document = None
+        while 1:
+            if verbose: pymupdf.log(f'{os.getpid()=}: calling get().')
+            page_num = queue_pc.get()
+            if verbose: pymupdf.log(f'{os.getpid()=}: {page_num=}.')
+            if page_num is None:
+                break
+            try:
+                if document is None:
+                    document = pymupdf.Document(path)
+                page = document[page_num]
+                ret = page.get_text(**args_dict)
+            except Exception as e:
+                ret = e
+            queue_cp.put( (page_num, ret) )
+
+    error = None
+
+    # Start child processes.
+    pids = list()
+    try:
+        for i in range(concurrency):
+            p = os.fork()   # pylint: disable=no-member
+            if p == 0:
+                # Child process.
+                try:
+                    childfn()
+                finally:
+                    if verbose: pymupdf.log(f'{os.getpid()=}: calling os._exit(0)')
+                    os._exit(0)
+            pids.append(p)
+
+        # Send page numbers.
+        for page_num in range(len(pages)):
+            queue_pc.put(page_num)
+
+        # Collect results.
+        ret = [None] * len(pages)
+        for i in range(len(pages)):
+            page_num, text = queue_cp.get()
+            if verbose: pymupdf.log(f'{page_num=} {len(text)=}')
+            assert ret[page_num] is None
+            if isinstance(text, Exception):
+                if not error:
+                    error = text
+                break
+            ret[page_num] = text
+
+        # Close queue. This should cause exception in workers and terminate
+        # them, but on macos-arm64 this does not seem to happen, so we also
+        # send None, which makes workers terminate.
+        for i in range(concurrency):
+            queue_pc.put(None)
+        if verbose: pymupdf.log(f'Closing queues.')
+        queue_pc.close()
+
+        if error:
+            raise error
+        if verbose: pymupdf.log(f'After concurrent, returning {len(ret)=}')
+        return ret
+        
+    finally:
+        # Join all child processes.
+        for pid in pids:
+            if verbose: pymupdf.log(f'waiting for {pid=}.')
+            e = os.waitpid(pid, 0)
+            if verbose: pymupdf.log(f'{pid=} => {e=}')
diff --git a/tests/test_pylint.py b/tests/test_pylint.py
index 82e4305c2..6428b5fd1 100644
--- a/tests/test_pylint.py
+++ b/tests/test_pylint.py
@@ -35,6 +35,7 @@ def test_pylint():
             W0622: Redefining built-in 'FileNotFoundError' (redefined-builtin)
             W0622: Redefining built-in 'open' (redefined-builtin)
             W1309: Using an f-string that does not have any interpolated variables (f-string-without-interpolation)
+            R1734: Consider using [] instead of list() (use-list-literal)
             '''
             )
     
@@ -80,6 +81,7 @@ def test_pylint():
             W0718: Catching too general exception Exception (broad-exception-caught)
             W0719: Raising too general exception: Exception (broad-exception-raised)
             C3001: Lambda expression assigned to a variable. Define a function using the "def" keyword instead. (unnecessary-lambda-assignment)
+            R0801: Similar lines in 2 files
             '''
             )
     ignores_list = list()
@@ -110,6 +112,7 @@ def test_pylint():
     leafs = [
             '__init__.py',
             '__main__.py',
+            '_get_text.py',
             'fitz___init__.py',
             'fitz_table.py',
             'fitz_utils.py',
@@ -117,7 +120,7 @@ def test_pylint():
             'table.py',
             'utils.py',
             ]
-    
+    leafs.sort()
     try:
         leafs_git = pipcl.git_items(directory)
     except Exception as e:
diff --git a/tests/test_textextract.py b/tests/test_textextract.py
index 22fc82917..7dc95c2cc 100644
--- a/tests/test_textextract.py
+++ b/tests/test_textextract.py
@@ -267,3 +267,40 @@ def test_3197():
                 assert text_utf8 == text_utf8_expected[i]
             else:
                 assert text_utf8 != text_utf8_expected[i]
+
+def test_document_text():
+    import platform
+    import time
+    
+    path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf')
+    concurrency = None
+    
+    def llen(texts):
+        l = 0
+        for text in texts:
+            l += len(text) if isinstance(text, str) else text
+        return l
+
+    print('')
+    method = 'single'
+    t = time.time()
+    document = pymupdf.Document(path)
+    texts1 = pymupdf.get_text(path)
+    t1 = time.time() - t
+    print(f'{method}: {t1=} {llen(texts1)=}', flush=1)
+
+    method = 'mp'
+    t = time.time()
+    texts2 = pymupdf.get_text(path, concurrency=concurrency, method=method)
+    t2 = time.time() - t
+    print(f'{method}: {concurrency=} {t2=} ({t1/t2:.2f}x) {llen(texts2)=}', flush=1)
+    assert texts2 == texts1
+
+    if platform.system() != 'Windows':
+        method = 'fork'
+        t = time.time()
+        texts3 = pymupdf.get_text(path, concurrency=concurrency, method='fork')
+        t3 = time.time() - t
+        print(f'{method}: {concurrency=} {t3=} ({t1/t3:.2f}x) {llen(texts3)=}', flush=1)
+        assert texts3 == texts1
+