# Source code for tika.unpack

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import csv
import tarfile
from contextlib import closing
from io import BytesIO, TextIOWrapper

from .tika import ServerEndpoint, callServer, parse1

_text_wrapper = TextIOWrapper


def from_file(filename, serverEndpoint=ServerEndpoint, requestOptions=None):
    '''
    Parse from file
    :param filename: file
    :param serverEndpoint: Tika server end point (optional)
    :param requestOptions: options forwarded unchanged to ``parse1`` (optional);
        a fresh empty dict is used when omitted
    :return: the dictionary built by ``_parse`` from the server's tar response
    '''
    # A ``{}`` default would be one dict object shared by every call; build a
    # new one per call so nothing downstream can leak state between calls.
    if requestOptions is None:
        requestOptions = {}
    services = {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml', 'unpack': '/unpack/all'}
    tarOutput = parse1('unpack', filename, serverEndpoint,
                       responseMimeType='application/x-tar',
                       services=services,
                       rawResponse=True, requestOptions=requestOptions)
    return _parse(tarOutput)
def from_buffer(string, serverEndpoint=ServerEndpoint, headers=None, requestOptions=None):
    '''
    Parse from buffered content
    :param string: buffered content
    :param serverEndpoint: Tika server URL (Optional)
    :param headers: extra request headers (optional); the mapping is copied and
        never modified, and ``Accept`` is always set to ``application/x-tar``
    :param requestOptions: options forwarded unchanged to ``callServer``
        (optional); a fresh empty dict is used when omitted
    :return: parsed content
    '''
    # Work on a copy: updating the caller's dict in place would leave the
    # injected Accept header behind in their object after the call returns.
    requestHeaders = dict(headers) if headers else {}
    requestHeaders['Accept'] = 'application/x-tar'

    # A ``{}`` default would be one dict object shared by every call.
    if requestOptions is None:
        requestOptions = {}

    status, response = callServer('put', serverEndpoint, '/unpack/all', string,
                                  requestHeaders, False,
                                  rawResponse=True, requestOptions=requestOptions)
    return _parse((status, response))
def _parse(tarOutput):
    '''
    Convert the raw unpack response into a dictionary.

    :param tarOutput: ``(status, body)`` pair whose second item holds the bytes
        of a tar archive
    :return: ``{}`` when ``tarOutput`` is falsy or its body is ``None``/empty;
        otherwise a dict with the keys ``content`` (text of the ``__TEXT__``
        member), ``metadata`` (parsed ``__METADATA__`` member) and
        ``attachments`` (member name -> raw bytes of every other regular file)
    :raises ValueError: if a metadata row carries a key but no value
    '''
    if not tarOutput:
        return {}
    payload = tarOutput[1]
    if payload is None or payload == b"":
        return {}

    with tarfile.open(fileobj=BytesIO(payload)) as tarFile:
        # get the member names
        memberNames = list(tarFile.getnames())

        # extract the metadata
        metadata = {}
        if "__METADATA__" in memberNames:
            memberNames.remove("__METADATA__")
            metadataMember = tarFile.getmember("__METADATA__")
            if _is_regular_file(metadataMember):
                metadata = _read_metadata(tarFile, metadataMember)

        # get the content
        content = ""
        if "__TEXT__" in memberNames:
            memberNames.remove("__TEXT__")
            contentMember = tarFile.getmember("__TEXT__")
            if _is_regular_file(contentMember):
                with closing(_text_wrapper(tarFile.extractfile(contentMember), encoding='utf8')) as contentFile:
                    content = contentFile.read()

        # get the remaining files as attachments
        attachments = {}
        for attachment in memberNames:
            attachmentMember = tarFile.getmember(attachment)
            if _is_regular_file(attachmentMember):
                with closing(tarFile.extractfile(attachmentMember)) as attachmentFile:
                    attachments[attachment] = attachmentFile.read()

    return {"content": content, "metadata": metadata, "attachments": attachments}


def _is_regular_file(member):
    '''Return True when the tar member is a regular file and not a symbolic link.'''
    return not member.issym() and member.isfile()


def _read_metadata(tarFile, metadataMember):
    '''
    Parse the CSV metadata member of the archive into a dictionary.

    :param tarFile: open ``tarfile.TarFile`` containing the member
    :param metadataMember: ``TarInfo`` of the metadata member
    :return: dict mapping each key to a single string, or to a list of strings
        when the row carries more than one value
    :raises ValueError: if a row carries a key but no value
    '''
    metadata = {}
    # NOTE(review): tarFile.encoding is the encoding tarfile uses for header
    # names; it is reused here to decode the CSV body - confirm it matches
    # what the server writes.
    with closing(_text_wrapper(tarFile.extractfile(metadataMember), encoding=tarFile.encoding)) as metadataFile:
        for row in csv.reader(_truncate_nulls(metadataFile)):
            # csv.reader yields an empty list for a blank line; that is not
            # data, so it must not abort the whole parse
            if not row:
                continue
            # explicit check instead of ``assert``: asserts vanish under
            # ``python -O`` and the failure would degrade to an IndexError
            if len(row) < 2:
                raise ValueError("metadata row %r has a key but no value" % (row,))
            # each metadata line comes as a key-value pair, with list values
            # returned as extra values in the line - convert single values
            # to non-list values to be consistent with parser metadata
            key, values = row[0], row[1:]
            metadata[key] = values if len(values) > 1 else values[0]
    return metadata


# TODO: Remove if/when fixed. https://issues.apache.org/jira/browse/TIKA-3070
def _truncate_nulls(s):
    '''
    Lazily yield every line of ``s`` with all NUL characters removed.

    :param s: iterable of text lines
    :return: generator over the cleaned lines
    '''
    for line in s:
        yield line.replace('\0', '')