From 3563a03d0996b91aa77d084e43d35bca12150d08 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Sat, 18 May 2024 23:23:39 +0100 Subject: [PATCH 1/6] pipcl.py: minor changes to diagnostics. Also removed trailing white space. --- pipcl.py | 61 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/pipcl.py b/pipcl.py index 0434e8af9..3680ca53b 100644 --- a/pipcl.py +++ b/pipcl.py @@ -237,21 +237,21 @@ class Package: >>> assert len(so) == 1 >>> so = so[0] >>> assert os.path.getmtime(so) > t0 - + Check `entry_points` causes creation of command `foo_cli` when we install from our wheel using pip. [As of 2024-02-24 using pipcl's CLI interface directly with `setup.py install` does not support entry points.] - + >>> print('Creating venv.', file=sys.stderr) >>> _ = subprocess.run( ... f'cd pipcl_test && {sys.executable} -m venv pylocal', ... shell=1, check=1) - + >>> print('Installing from wheel into venv using pip.', file=sys.stderr) >>> _ = subprocess.run( ... f'. pipcl_test/pylocal/bin/activate && pip install pipcl_test/dist/*.whl', ... shell=1, check=1) - + >>> print('Running foo_cli.', file=sys.stderr) >>> _ = subprocess.run( ... f'. pipcl_test/pylocal/bin/activate && foo_cli', @@ -299,7 +299,7 @@ def __init__(self, requires_external = None, project_url = None, provides_extra = None, - + entry_points = None, root = None, @@ -374,21 +374,21 @@ def __init__(self, entry_points: String or dict specifying *.dist-info/entry_points.txt, for example: - + ``` [console_scripts] foo_cli = foo.__main__:main ``` - + or: - + { 'console_scripts': [ 'foo_cli = foo.__main__:main', ], } - + See: https://packaging.python.org/en/latest/specifications/entry-points/ - + root: Root of package, defaults to current directory. @@ -684,7 +684,7 @@ def add_str(content, to_): # Add -.dist-info/COPYING. 
if self.license: add_str(self.license, f'{dist_info_dir}/COPYING') - + # Add -.dist-info/entry_points.txt. entry_points_text = self._entry_points_text() if entry_points_text: @@ -735,15 +735,15 @@ def build_sdist(self, os.makedirs(sdist_directory, exist_ok=True) tarpath = f'{sdist_directory}/{prefix}.tar.gz' log2(f'Creating sdist: {tarpath}') - + with tarfile.open(tarpath, 'w:gz') as tar: - + names_in_tar = list() def check_name(name): if name in names_in_tar: raise Exception(f'Name specified twice: {name}') names_in_tar.append(name) - + def add(from_, name): check_name(name) if isinstance(from_, str): @@ -757,7 +757,7 @@ def add(from_, name): tar.addfile(ti, io.BytesIO(from_)) else: assert 0 - + def add_string(text, name): textb = text.encode('utf8') return add(textb, name) @@ -776,7 +776,7 @@ def add_string(text, name): if to_rel == 'pyproject.toml': found_pyproject_toml = True add(from_, to_rel) - + if not found_pyproject_toml: log0(f'Warning: no pyproject.toml specified.') @@ -884,10 +884,10 @@ def add_str(content, to_abs, to_rel): add_file( from_, to_abs2, to_rel) add_str( self._metainfo(), f'{root2}/{dist_info_dir}/METADATA', f'{dist_info_dir}/METADATA') - + if self.license: add_str( self.license, f'{root2}/{dist_info_dir}/COPYING', f'{dist_info_dir}/COPYING') - + entry_points_text = self._entry_points_text() if entry_points_text: add_str( @@ -1284,7 +1284,7 @@ def _fromto(self, p): if isinstance(p, str): p = p, p assert isinstance(p, tuple) and len(p) == 2 - + from_, to_ = p assert isinstance(from_, (str, bytes)) assert isinstance(to_, str) @@ -1798,7 +1798,7 @@ def git_items( directory, submodules=False): return ret -def run( command, capture=False, check=1): +def run( command, capture=False, check=1, verbose=1): ''' Runs a command using `subprocess.run()`. @@ -1818,6 +1818,8 @@ def run( command, capture=False, check=1): check: If true we raise an exception on error; otherwise we include the command's returncode in our return value. 
+ verbose: + If true we show the command. Returns: check capture Return -------------------------- @@ -1828,7 +1830,8 @@ def run( command, capture=False, check=1): ''' lines = _command_lines( command) nl = '\n' - log2( f'Running: {nl.join(lines)}') + if verbose: + log1( f'Running: {nl.join(lines)}') sep = ' ' if windows() else '\\\n' command2 = sep.join( lines) cp = subprocess.run( @@ -1924,18 +1927,18 @@ def __init__(self): stderr=subprocess.DEVNULL, check=0, ).returncode - log1(f'{e=} from {pc!r}.') + log2(f'{e=} from {pc!r}.') if e == 0: python_config = pc assert python_config, f'Cannot find python-config' else: python_config = f'{python_exe}-config' - log1(f'Using {python_config=}.') + log2(f'Using {python_config=}.') try: - self.includes = run( f'{python_config} --includes', capture=1).strip() + self.includes = run( f'{python_config} --includes', capture=1, verbose=0).strip() except Exception as e: raise Exception('We require python development tools to be installed.') from e - self.ldflags = run( f'{python_config} --ldflags', capture=1).strip() + self.ldflags = run( f'{python_config} --ldflags', capture=1, verbose=0).strip() if linux(): # It seems that with python-3.10 on Linux, we can get an # incorrect -lcrypt flag that on some systems (e.g. 
WSL) @@ -2100,7 +2103,7 @@ def run_if( command, out, *prerequisites): if not doit: out_mtime = _fs_mtime( out) if out_mtime == 0: - doit = 'File does not exist: {out!e}' + doit = f'File does not exist: {out!r}' cmd_path = f'{out}.cmd' if os.path.isfile( cmd_path): @@ -2155,7 +2158,7 @@ def _make_prerequisites(p): os.remove( cmd_path) except Exception: pass - log2( f'Running command because: {doit}') + log1( f'Running command because: {doit}') run( command) @@ -2164,7 +2167,7 @@ def _make_prerequisites(p): f.write( command) return True else: - log2( f'Not running command because up to date: {out!r}') + log1( f'Not running command because up to date: {out!r}') if 0: log2( f'out_mtime={time.ctime(out_mtime)} pre_mtime={time.ctime(pre_mtime)}.' @@ -2361,7 +2364,7 @@ def add_content(self, content, to_, verbose=True): log2(f'Adding {to_}') def add_file(self, from_, to_): - log2(f'Adding file: {os.path.relpath(from_)} => {to_}') + log1(f'Adding file: {os.path.relpath(from_)} => {to_}') with open(from_, 'rb') as f: content = f.read() self.add_content(content, to_, verbose=False) From 7c2a475593f2dd4023b2d0d1df389a272da4425d Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Sat, 18 May 2024 23:23:52 +0100 Subject: [PATCH 2/6] setup.py: reduced diagnostics. Also put generated files in src/build/, so that they are not removed by `git clean` unless `-d` is specified. --- setup.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/setup.py b/setup.py index e2b426015..c9e39e9aa 100755 --- a/setup.py +++ b/setup.py @@ -571,13 +571,6 @@ def build(): build_type, ) - for d in ( - mupdf_build_dir, - f'{g_root}/src', - ): - if d: - run(f'ls -l {os.path.relpath(d)}', check=0) - # Generate lists of (from, to) items to return to pipcl. We put MuPDF # shared libraries in a separate list so that we can build specific wheels # as determined by g_flavour. 
@@ -622,8 +615,8 @@ def add( ret, from_, to_): add( ret_p, f'{g_root}/src/pymupdf.py', to_dir) add( ret_p, f'{g_root}/src/table.py', to_dir) add( ret_p, f'{g_root}/src/utils.py', to_dir) - add( ret_p, f'{g_root}/src/extra.py', to_dir) - add( ret_p, f'{g_root}/src/{path_so_leaf_b}', to_dir) + add( ret_p, f'{g_root}/src/build/extra.py', to_dir) + add( ret_p, f'{g_root}/src/build/{path_so_leaf_b}', to_dir) if mupdf_local: add( ret_p, f'{mupdf_build_dir}/mupdf.py', to_dir) @@ -659,8 +652,6 @@ def add( ret, from_, to_): else: add( ret, f'{g_root}/README.md', '$dist-info/README.md') - for f, t in ret: - log( f'build(): {f} => {t}') return ret @@ -963,7 +954,7 @@ def _build_extension_rebased( mupdf_local, mupdf_build_dir, build_type): path_so_leaf_b = pipcl.build_extension( name = 'extra', path_i = f'{g_root}/src/extra.i', - outdir = f'{g_root}/src', + outdir = f'{g_root}/src/build', includes = includes, defines = defines, libpaths = libpaths, From 1296fba0984fd372e1c99b4b9d76d0c31c229cf8 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Mon, 20 May 2024 18:20:07 +0100 Subject: [PATCH 3/6] src/__init__.py: patch one extra line to use _format_g(). This patches up one place that was missed in earlier commit. --- src/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/__init__.py b/src/__init__.py index 3f307a7db..cc1e65aa1 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -8994,7 +8994,8 @@ def remove_rotation(self): # prefix with derotation matrix mat = mat0 * self.derotation_matrix - cmd = b"%g %g %g %g %g %g cm " % tuple(mat) + cmd = _format_g(tuple(mat)) + ' cm ' + cmd = cmd.encode('utf8') _ = TOOLS._insert_contents(self, cmd, False) # prepend to page contents # swap x- and y-coordinates From 35332d01e9820a39418bf8d2fa920aa14800fe44 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Mon, 20 May 2024 18:22:18 +0100 Subject: [PATCH 4/6] src/__init__.py tests/conftest.py: check no calls to log() when running tests. 
Also changed exception_info() to call log() instead of writing directly to _g_out_log, so that exception backtraces are also checked. --- src/__init__.py | 9 +++++++-- tests/conftest.py | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index cc1e65aa1..78d38dcd7 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -61,6 +61,8 @@ def _set_stream(name, default): _g_out_log = _set_stream('PYMUPDF_LOG', sys.stdout) _g_out_message = _set_stream('PYMUPDF_MESSAGE', sys.stdout) +# Set to list() if we are in test suite. +_g_log_items = None def log( text='', caller=1): ''' @@ -70,7 +72,10 @@ def log( text='', caller=1): filename = os.path.relpath(frame_record.filename) line = frame_record.lineno function = frame_record.function - print( f'{filename}:{line}:{function}: {text}', file=_g_out_log) + text = f'{filename}:{line}:{function}: {text}' + if _g_log_items is not None: + _g_log_items.append(text) + print(text, file=_g_out_log) _g_out_log.flush() @@ -85,7 +90,7 @@ def message(text=''): def exception_info(): import traceback log(f'exception_info:') - traceback.print_exc(file=_g_out_log) + log(traceback.format_exc()) # PDF names must not contain these characters: diff --git a/tests/conftest.py b/tests/conftest.py index 3aba1c948..76ef90ef7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,6 +12,8 @@ def wrap(*args, **kwargs): assert not wt, f'{wt=}' assert not pymupdf.TOOLS.set_small_glyph_heights() + pymupdf._g_log_items = list() + # Run the test. rep = yield @@ -23,3 +25,5 @@ def wrap(*args, **kwargs): assert not wt, f'Warnings text not empty: {wt=}' assert not pymupdf.TOOLS.set_small_glyph_heights() + + assert not pymupdf._g_log_items, f'log() was called; {len(pymupdf._g_log_items)=}.' 
From 798991977821e2d3a8c6ee880b51d58cd63edc2c Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Mon, 20 May 2024 17:38:05 +0100 Subject: [PATCH 5/6] src/utils.py: disable various expected calls to pymupdf.exception_info(). Addresses #3479 and #3488. --- src/utils.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/utils.py b/src/utils.py index b4fdd43ea..f26ffe5cd 100644 --- a/src/utils.py +++ b/src/utils.py @@ -933,7 +933,7 @@ def getLinkDict(ln, document=None) -> dict: nl["from"] = ln.rect except Exception: # This seems to happen quite often in PyMuPDF/tests. - if g_exceptions_verbose: pymupdf.exception_info() + if g_exceptions_verbose >= 2: pymupdf.exception_info() pass pnt = pymupdf.Point(0, 0) if dest.flags & pymupdf.LINK_FLAG_L_VALID: @@ -1444,46 +1444,46 @@ def set_toc( txt += ol["dest"] except Exception: # Verbose in PyMuPDF/tests. - if g_exceptions_verbose: pymupdf.exception_info() + if g_exceptions_verbose >= 2: pymupdf.exception_info() pass try: if ol["first"] > -1: txt += "/First %i 0 R" % xref[ol["first"]] except Exception: - if g_exceptions_verbose: pymupdf.exception_info() + if g_exceptions_verbose >= 2: pymupdf.exception_info() pass try: if ol["last"] > -1: txt += "/Last %i 0 R" % xref[ol["last"]] except Exception: - if g_exceptions_verbose: pymupdf.exception_info() + if g_exceptions_verbose >= 2: pymupdf.exception_info() pass try: if ol["next"] > -1: txt += "/Next %i 0 R" % xref[ol["next"]] except Exception: # Verbose in PyMuPDF/tests. - if g_exceptions_verbose: pymupdf.exception_info() + if g_exceptions_verbose >= 2: pymupdf.exception_info() pass try: if ol["parent"] > -1: txt += "/Parent %i 0 R" % xref[ol["parent"]] except Exception: # Verbose in PyMuPDF/tests. - if g_exceptions_verbose: pymupdf.exception_info() + if g_exceptions_verbose >= 2: pymupdf.exception_info() pass try: if ol["prev"] > -1: txt += "/Prev %i 0 R" % xref[ol["prev"]] except Exception: # Verbose in PyMuPDF/tests. 
- if g_exceptions_verbose: pymupdf.exception_info() + if g_exceptions_verbose >= 2: pymupdf.exception_info() pass try: txt += "/Title" + ol["title"] except Exception: # Verbose in PyMuPDF/tests. - if g_exceptions_verbose: pymupdf.exception_info() + if g_exceptions_verbose >= 2: pymupdf.exception_info() pass if ol.get("color") and len(ol["color"]) == 3: @@ -4686,8 +4686,7 @@ def output_justify(start, line): try: line, tl = new_lines.pop(0) except IndexError: - # Verbose in PyMuPDF/tests. - if g_exceptions_verbose: pymupdf.exception_info() + if g_exceptions_verbose >= 2: pymupdf.exception_info() break if right_to_left: # Arabic, Hebrew From 081ca67044dd240b04ff911a42bca95f9d34ba46 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Thu, 16 May 2024 13:17:05 +0100 Subject: [PATCH 6/6] Add pymupdf.get_text() - optionally concurrent call of Page.get_text() on some/all pages. setup.py Add new _get_text.py to wheels/installs. src/__init__.py New top-level get_text() fn, calls _get_text.get(text). src/_get_text.py New, contains implementation of get_text(). tests/test_pylint.py Avoid pylint failure by disabling `R0801: Similar lines in 2 files`. tests/test_textextract.py Test get_text() and show timings. Timings for MacOS-arm64 and PDF spec: method='multiprocessing' : 3.3x. method='fork': 3.6x. --- setup.py | 11 ++- src/__init__.py | 73 +++++++++++++++++++ src/_get_text.py | 144 ++++++++++++++++++++++++++++++++++++++ tests/test_pylint.py | 5 +- tests/test_textextract.py | 37 ++++++++++ 5 files changed, 266 insertions(+), 4 deletions(-) create mode 100644 src/_get_text.py diff --git a/setup.py b/setup.py index c9e39e9aa..9a6116753 100755 --- a/setup.py +++ b/setup.py @@ -606,21 +606,26 @@ def add( ret, from_, to_): if path_so_leaf_b: # Add rebased implementation files. - add( ret_p, f'{g_root}/src/fitz___init__.py', 'fitz/__init__.py') # For `fitz` module alias. - add( ret_p, f'{g_root}/src/fitz_table.py', 'fitz/table.py') # For `fitz` module alias. 
- add( ret_p, f'{g_root}/src/fitz_utils.py', 'fitz/utils.py') # For `fitz` module alias. to_dir = 'pymupdf/' add( ret_p, f'{g_root}/src/__init__.py', to_dir) add( ret_p, f'{g_root}/src/__main__.py', to_dir) add( ret_p, f'{g_root}/src/pymupdf.py', to_dir) add( ret_p, f'{g_root}/src/table.py', to_dir) add( ret_p, f'{g_root}/src/utils.py', to_dir) + add( ret_p, f'{g_root}/src/_get_text.py', to_dir) add( ret_p, f'{g_root}/src/build/extra.py', to_dir) add( ret_p, f'{g_root}/src/build/{path_so_leaf_b}', to_dir) + # Add support for `fitz` backwards compatibility. + add( ret_p, f'{g_root}/src/fitz___init__.py', 'fitz/__init__.py') + add( ret_p, f'{g_root}/src/fitz_table.py', 'fitz/table.py') + add( ret_p, f'{g_root}/src/fitz_utils.py', 'fitz/utils.py') + if mupdf_local: + # Add MuPDF Python API. add( ret_p, f'{mupdf_build_dir}/mupdf.py', to_dir) + # Add MuPDF shared libraries. if windows: wp = pipcl.wdev.WindowsPython() add( ret_p, f'{mupdf_build_dir}/_mupdf.pyd', to_dir) diff --git a/src/__init__.py b/src/__init__.py index 78d38dcd7..e5381f2f0 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -20942,6 +20942,79 @@ def vdist(dir, a, b): return mupdf.fz_abs(dx * dir.y + dy * dir.x) +def get_text( + path, + *, + pages=None, + method='single', + concurrency=None, + + option='text', + clip=None, + flags=None, + textpage=None, + sort=False, + delimiters=None, + ): + ''' + Returns list of results from `Page.get_text()`, optionally using + concurrency for speed. + + Args: + path: + Path of document. + pages: + List of page numbers to process, or None to include all pages. + method: + 'single' + Do not use concurrency. + 'mp' + Operate concurrently using Python's `multiprocessing` module. + 'fork' + Operate concurrently using custom implementation with + `os.fork`. Does not work on Windows. + concurrency: + Number of worker processes to use when operating concurrently. If + None, we use the number of available CPUs. 
+ option clip flags textpage sort delimiters: + Passed to internal calls to `Page.get_text()`. + ''' + args_dict = dict( + option=option, + clip=clip, + flags=flags, + textpage=textpage, + sort=sort, + delimiters=delimiters, + ) + + if method == 'single': + ret = list() + document = Document(path) + for page in document: + text = page.get_text(**args_dict) + ret.append(text) + return ret + + # Use concurrency. + # + from . import _get_text + + if pages is None: + with Document(path) as document: + num_pages = len(document) + pages = list(range(num_pages)) + + if method == 'mp': + return _get_text._get_text_mp(path, pages, concurrency, args_dict) + + elif method == 'fork': + return _get_text._get_text_fork(path, pages, concurrency, args_dict) + + else: + assert 0, f'Unrecognised {method=}.' + + class TOOLS: ''' We use @staticmethod to avoid the need to create an instance of this class. diff --git a/src/_get_text.py b/src/_get_text.py new file mode 100644 index 000000000..c98d68fde --- /dev/null +++ b/src/_get_text.py @@ -0,0 +1,144 @@ +import multiprocessing +import os + +import pymupdf + + +# Support for `method='mp'`. +# +# By default each `multiprocessing` worker process would create a `Document` +# each time it was asked to process a page. We avoid this by using a global +# `Document` instance. Haven't found a more elegant way - putting state +# into a class on the server before creating workers doesn't work because +# multiprocessing appears to always send the server's state in each iteration. +# +# It's not too bad because this global state is only ever used by workers, so +# doesn't actually limit things in general. 
+# +_mp_worker_path = None +_mp_worker_document = None +_mp_worker_args_dict = None + +def _mp_worker_init(path, args_dict): + global _mp_worker_path + global _mp_worker_args_dict + assert _mp_worker_path is None + assert _mp_worker_args_dict is None + _mp_worker_path = path + _mp_worker_args_dict = args_dict + +def _mp_worker(page_number): + global _mp_worker_document + if not _mp_worker_document: + _mp_worker_document = pymupdf.Document(_mp_worker_path) + page = _mp_worker_document[page_number] + ret = page.get_text(**_mp_worker_args_dict) + return ret + + +def _get_text_mp( + path, + pages, + concurrency, + args_dict, + ): + with multiprocessing.Pool( + concurrency, + _mp_worker_init, + (path, args_dict), + ) as pool: + result = pool.map_async(_mp_worker, pages) + return result.get() + + +def _get_text_fork( + path, + pages, + concurrency, + args_dict, + ): + ''' + Implementation for `method='fork'`. + ''' + verbose = 0 + if concurrency is None: + concurrency = multiprocessing.cpu_count() + # We send page numbers to queue_pc and collect (page_num, text) from + # queue_cp. Workers each repeatedly take the next available page number + # from queue_pc, extract the text and put it onto queue_cp. + # + # This is better than pre-allocating a subset of pages to each worker + # because it ensures there will never be idle workers until we are near + # the end with fewer pages left than workers. + # + queue_pc = multiprocessing.Queue() + queue_cp = multiprocessing.Queue() + + def childfn(): + document = None + while 1: + if verbose: pymupdf.log(f'{os.getpid()=}: calling get().') + page_num = queue_pc.get() + if verbose: pymupdf.log(f'{os.getpid()=}: {page_num=}.') + if page_num is None: + break + try: + if document is None: + document = pymupdf.Document(path) + page = document[page_num] + ret = page.get_text(**args_dict) + except Exception as e: + ret = e + queue_cp.put( (page_num, ret) ) + + error = None + + # Start child processes. 
+ pids = list() + try: + for i in range(concurrency): + p = os.fork() # pylint: disable=no-member + if p == 0: + # Child process. + try: + childfn() + finally: + if verbose: pymupdf.log(f'{os.getpid()=}: calling os._exit(0)') + os._exit(0) + pids.append(p) + + # Send page numbers. + for page_num in range(len(pages)): + queue_pc.put(page_num) + + # Collect results. + ret = [None] * len(pages) + for i in range(len(pages)): + page_num, text = queue_cp.get() + if verbose: pymupdf.log(f'{page_num=} {len(text)=}') + assert ret[page_num] is None + if isinstance(text, Exception): + if not error: + error = text + break + ret[page_num] = text + + # Close queue. This should cause exception in workers and terminate + # them, but on macos-arm64 this does not seem to happen, so we also + # send None, which makes workers terminate. + for i in range(concurrency): + queue_pc.put(None) + if verbose: pymupdf.log(f'Closing queues.') + queue_pc.close() + + if error: + raise error + if verbose: pymupdf.log(f'After concurrent, returning {len(ret)=}') + return ret + + finally: + # Join all child processes. + for pid in pids: + if verbose: pymupdf.log(f'waiting for {pid=}.') + e = os.waitpid(pid, 0) + if verbose: pymupdf.log(f'{pid=} => {e=}') diff --git a/tests/test_pylint.py b/tests/test_pylint.py index 82e4305c2..6428b5fd1 100644 --- a/tests/test_pylint.py +++ b/tests/test_pylint.py @@ -35,6 +35,7 @@ def test_pylint(): W0622: Redefining built-in 'FileNotFoundError' (redefined-builtin) W0622: Redefining built-in 'open' (redefined-builtin) W1309: Using an f-string that does not have any interpolated variables (f-string-without-interpolation) + R1734: Consider using [] instead of list() (use-list-literal) ''' ) @@ -80,6 +81,7 @@ def test_pylint(): W0718: Catching too general exception Exception (broad-exception-caught) W0719: Raising too general exception: Exception (broad-exception-raised) C3001: Lambda expression assigned to a variable. Define a function using the "def" keyword instead. 
(unnecessary-lambda-assignment) + R0801: Similar lines in 2 files ''' ) ignores_list = list() @@ -110,6 +112,7 @@ def test_pylint(): leafs = [ '__init__.py', '__main__.py', + '_get_text.py', 'fitz___init__.py', 'fitz_table.py', 'fitz_utils.py', @@ -117,7 +120,7 @@ def test_pylint(): 'table.py', 'utils.py', ] - + leafs.sort() try: leafs_git = pipcl.git_items(directory) except Exception as e: diff --git a/tests/test_textextract.py b/tests/test_textextract.py index 22fc82917..7dc95c2cc 100644 --- a/tests/test_textextract.py +++ b/tests/test_textextract.py @@ -267,3 +267,40 @@ def test_3197(): assert text_utf8 == text_utf8_expected[i] else: assert text_utf8 != text_utf8_expected[i] + +def test_document_text(): + import platform + import time + + path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf') + concurrency = None + + def llen(texts): + l = 0 + for text in texts: + l += len(text) if isinstance(text, str) else text + return l + + print('') + method = 'single' + t = time.time() + document = pymupdf.Document(path) + texts1 = pymupdf.get_text(path) + t1 = time.time() - t + print(f'{method}: {t1=} {llen(texts1)=}', flush=1) + + method = 'mp' + t = time.time() + texts2 = pymupdf.get_text(path, concurrency=concurrency, method=method) + t2 = time.time() - t + print(f'{method}: {concurrency=} {t2=} ({t1/t2:.2f}x) {llen(texts2)=}', flush=1) + assert texts2 == texts1 + + if platform.system() != 'Windows': + method = 'fork' + t = time.time() + texts3 = pymupdf.get_text(path, concurrency=concurrency, method='fork') + t3 = time.time() - t + print(f'{method}: {concurrency=} {t3=} ({t1/t3:.2f}x) {llen(texts3)=}', flush=1) + assert texts3 == texts1 +