Source code for tika.parser

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import json

from .tika import ServerEndpoint, callServer, parse1


def from_file(filename, serverEndpoint=ServerEndpoint, service='all', xmlContent=False, headers=None,
              config_path=None, requestOptions=None, raw_response=False):
    '''
    Parses a file for metadata and content
    :param filename: path to file which needs to be parsed or binary file using open(path,'rb')
    :param serverEndpoint: Server endpoint url
    :param service: service requested from the tika server
                    Default is 'all', which results in recursive text content+metadata.
                    'meta' returns only metadata
                    'text' returns only content
    :param xmlContent: Whether or not XML content be requested.
                    Default is 'False', which results in text content.
    :param headers: Request headers to be sent to the tika reset server, should
                    be a dictionary. This is optional
    :param config_path: forwarded unchanged to ``parse1``. This is optional
    :param requestOptions: dictionary forwarded to ``parse1``
                    (presumably extra options for the HTTP request -- confirm
                    against ``parse1``). ``None`` (the default) is treated as
                    an empty dictionary.
    :param raw_response: if True, the value returned by ``parse1`` is handed
                    back untouched instead of being converted by ``_parse``.
    :return: dictionary having 'metadata' and 'content' keys (plus 'status'
            when the server produced any output).
            'content' has a str value and metadata has a dict type value.
            When ``raw_response`` is True, the unprocessed ``parse1`` output.
    '''
    # A fresh dictionary per call: a ``{}`` default in the signature would be
    # one object shared by every call that omits the argument.
    if requestOptions is None:
        requestOptions = {}

    forwarded = {
        'headers': headers,
        'config_path': config_path,
        'requestOptions': requestOptions,
    }
    if xmlContent:
        # Replace the service -> endpoint table so that 'all' is routed to the
        # XML flavour of the recursive-metadata endpoint.
        forwarded['services'] = {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'}

    output = parse1(service, filename, serverEndpoint, **forwarded)

    if raw_response:
        return output
    return _parse(output, service)
def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None,
                requestOptions=None, raw_response=False):
    '''
    Parses the content from buffer
    :param string: Buffer value
    :param serverEndpoint: Server endpoint. This is optional
    :param xmlContent: Whether or not XML content be requested.
                    Default is 'False', which results in text content.
    :param headers: Request headers to be sent to the tika reset server, should
                    be a dictionary. This is optional. The dictionary passed in
                    is not modified; an 'Accept: application/json' entry is
                    added to a private copy.
    :param config_path: forwarded unchanged to ``callServer``. This is optional
    :param requestOptions: dictionary forwarded to ``callServer``
                    (presumably extra options for the HTTP request -- confirm
                    against ``callServer``). ``None`` (the default) is treated
                    as an empty dictionary.
    :param raw_response: if True, return the ``(status, response)`` tuple
                    obtained from ``callServer`` without further processing.
    :return: dictionary with 'metadata', 'content' and 'status' keys built by
            ``_parse``, or the raw ``(status, response)`` tuple when
            ``raw_response`` is True.
    '''
    # Work on a copy: updating ``headers`` in place would silently add the
    # 'Accept' entry to the dictionary owned by the caller.
    request_headers = dict(headers) if headers else {}
    request_headers['Accept'] = 'application/json'

    # A fresh dictionary per call instead of a shared mutable default.
    if requestOptions is None:
        requestOptions = {}

    endpoint = '/rmeta/xml' if xmlContent else '/rmeta/text'
    # NOTE(review): the positional ``False`` is passed exactly as before;
    # presumably it is callServer's verbose flag -- confirm against callServer.
    status, response = callServer('put', serverEndpoint, endpoint, string, request_headers, False,
                                  config_path=config_path, requestOptions=requestOptions)

    if raw_response:
        return (status, response)
    return _parse((status, response))
def _parse(output, service='all'):
    '''
    Converts the raw reply of the Tika REST API server into a result dictionary
    :param output: output from Tika Server; indexed as ``output[0]`` (status)
                    and ``output[1]`` (response body, JSON text unless
                    ``service`` is 'text')
    :param service: service requested from the tika server
                    Default is 'all', which results in recursive text content+metadata.
                    'meta' returns only metadata
                    'text' returns only content
    :return: a dictionary having 'metadata' and 'content' values, plus a
            'status' value whenever ``output`` is not empty
    '''
    result = {'metadata': None, 'content': None}
    if not output:
        return result

    result["status"] = output[0]
    body = output[1]
    if body is None or body == "":
        return result

    if service == "text":
        # Plain-text replies are returned verbatim, without JSON decoding.
        result["content"] = body
        return result

    decoded = json.loads(body)
    metadata = {}
    result["metadata"] = metadata

    if service == "meta":
        # The whole decoded object is the metadata.
        for name in decoded:
            metadata[name] = decoded[name]
        return result

    # Join the text of every entry that carries content, in order.
    text = "".join(
        entry["X-TIKA:content"] for entry in decoded if "X-TIKA:content" in entry
    )
    result["content"] = None if text == "" else text

    # Merge the remaining keys of all entries; a key seen more than once is
    # turned into a list to which each later value is appended as one element.
    for entry in decoded:
        for name in entry:
            if name == "X-TIKA:content":
                continue
            if name not in metadata:
                metadata[name] = entry[name]
                continue
            if not isinstance(metadata[name], list):
                metadata[name] = [metadata[name]]
            metadata[name].append(entry[name])
    return result