Adapt to new text-extractor gem

pull/6038/head
Wieland Lindenthal 7 years ago
parent e15379d3a7
commit e9bcdb44a1
  1. 8
      app/workers/extract_fulltext_job.rb
  2. 334
      lib/open_project/text_extractor.rb
  3. 126
      spec/lib/open_project/text_extractor_spec.rb

@ -1,3 +1,5 @@
require 'text_extractor'
class ExtractFulltextJob < ApplicationJob
# queue_as :text_extraction
@ -6,9 +8,9 @@ class ExtractFulltextJob < ApplicationJob
end
def perform
if attachment = find_attachment(@attachment_id) and
attachment.readable? and
text = OpenProject::TextExtractor.new(attachment).text
if (attachment = find_attachment(@attachment_id) and
attachment.readable? and
text = TextExtractor::Resolver.new(attachment.diskfile, attachment.content_type).text)
attachment.update_column :fulltext, text
end

@ -1,334 +0,0 @@
#-- encoding: UTF-8
#-- copyright
# OpenProject is a project management system.
# Copyright (C) 2012-2017 the OpenProject Foundation (OPF)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License version 3.
#
# OpenProject is a fork of ChiliProject, which is a fork of Redmine. The copyright follows:
# Copyright (C) 2006-2017 Jean-Philippe Lang
# Copyright (C) 2010-2013 the ChiliProject Team
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# See doc/COPYRIGHT.rdoc for more details.
#++
# Special thanks goes to Jens Kraemer (<jk@jkraemer.net>)
# who initially provided this code for Redmine while working on PLANIO.
module OpenProject
class TextExtractor
# Upper bound passed to mb_chars.limit when #text truncates the extracted
# fulltext.
MAX_FULLTEXT_LENGTH = 4.megabytes
# Per-tool command overrides read from the 'text_extractors' configuration
# key; an empty hash when that key is not set. The external command handlers
# look up their tool name here (e.g. 'pdftotext') before falling back to
# their own DEFAULT command.
TEXT_EXTRACTORS = OpenProject::Configuration['text_extractors'] || {}
# attachment - the object to extract text from. The handlers read its
#              content_type and diskfile.
def initialize(attachment)
@attachment = attachment
end
# returns the extracted fulltext or nil if no matching handler was found
# for the file type.
def text
Rails.logger.debug "TextExtractor text: Attachment"
if handler = find_handler and text = handler.text(@attachment)
text.gsub! /\s+/m, ' '
text.strip!
text.mb_chars.compose.limit(MAX_FULLTEXT_LENGTH).to_s
end
rescue Exception => e
Rails.logger.error "error in fulltext extraction: #{e}"
raise e unless e.is_a? StandardError # re-raise Signals / SyntaxErrors etc
end
private
# Returns the first handler in the chain that accepts the attachment, or
# nil when none does.
def find_handler
  @@file_handlers.find { |handler| handler.accept?(@attachment) }
end
# Base class of all handlers. Decides by content type whether a handler is
# responsible for an attachment.
class FileHandler
  # Returns true when the attachment's content type equals @content_type or,
  # if only @content_types is set, is contained in that list. A handler with
  # neither set accepts nothing.
  def accept?(attachment)
    return attachment.content_type == @content_type if @content_type
    return @content_types.include?(attachment.content_type) if @content_types

    false
  end
end
# Base class for handlers that obtain the text by running an external
# command and reading what it writes to its standard output.
#
# Subclasses set @command to an argument list that contains
# FILE_PLACEHOLDER at the position where the file path belongs.
class ExternalCommandHandler < FileHandler
  require 'fileutils'

  FILE_PLACEHOLDER = '__FILE__'.freeze

  # Reports whether a freshly built handler of this class is available.
  def self.available?
    new.available?
  end

  # TODO: Extract this to a proper module
  #
  # Runs cmd through IO.popen and hands the resulting IO object
  # (STDIN / STDOUT of the command) to the given block.
  #
  # Due to how popen works the command is executed directly, without
  # involving the shell, if cmd is an array.
  #
  # The write end is closed right away unless options[:write_stdin] is set.
  # Returns the value of the block, or nil when no block was given.
  def shellout(cmd, options = {}, &block)
    IO.popen(cmd, "r+") do |pipe|
      pipe.set_encoding("ASCII-8BIT") if pipe.respond_to?(:set_encoding)
      pipe.close_write unless options[:write_stdin]
      block.call(pipe) if block
    end
  end

  # Runs the configured command on the attachment's file and returns its
  # output as a String.
  def text(attachment)
    argv = @command.dup
    placeholder_position = argv.index(FILE_PLACEHOLDER)
    argv[placeholder_position] = attachment.diskfile.path
    output = shellout(argv, &:read)
    output.to_s
  end

  # Accepts an attachment only if the content type matches and the command
  # can actually be run.
  def accept?(attachment)
    super && available?
  end

  # True when a command is configured and its first element is an
  # executable file.
  def available?
    @command.present? && File.executable?(@command[0])
  end
end
# Extracts text from PDF files with the command configured under
# 'pdftotext', or DEFAULT when none is configured.
class PdfHandler < ExternalCommandHandler
  DEFAULT = %w[/usr/bin/pdftotext -enc UTF-8 __FILE__ -].freeze

  def initialize
    @command = TEXT_EXTRACTORS['pdftotext'] || DEFAULT
    @content_type = 'application/pdf'
  end
end
# Extracts text from RTF files with the command configured under 'unrtf',
# or DEFAULT when none is configured.
class RtfHandler < ExternalCommandHandler
  DEFAULT = %w[/usr/bin/unrtf --text __FILE__].freeze

  def initialize
    @command = TEXT_EXTRACTORS['unrtf'] || DEFAULT
    @content_type = 'application/rtf'
  end
end
# Handler base class for XML based (MS / Open / Libre) office documents.
#
# The document is opened as a zip archive. Subclasses set:
#   @file          - name of the archive entry that holds the text
#   @element       - local name of the XML element whose content is text
#   @namespace_uri - namespace URI that element has to belong to
class ZippedXmlHandler < FileHandler
  require 'zip'

  # SAX callbacks that collect the character data found inside one
  # particular namespaced element.
  class SaxDocument < Nokogiri::XML::SAX::Document
    attr_reader :text

    # text_element   - local name of the element to collect text from
    # text_namespace - namespace URI of that element
    def initialize(text_element, text_namespace)
      @element = text_element
      @namespace_uri = text_namespace
      @text = ''.dup
      @is_text = false
    end

    # Handle each element, expecting the name and any attributes
    def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
      @is_text = true if name == @element && uri == @namespace_uri
    end

    # Any characters between the start and end element expected as a string
    def characters(string)
      @text << string if @is_text
    end

    # Given the name of an element once its closing tag is reached
    def end_element_namespace(name, prefix = nil, uri = nil)
      return unless name == @element && uri == @namespace_uri

      # Separate the content of consecutive text elements by a space.
      @text << ' '
      @is_text = false
    end
  end

  # Returns the text found in the archive entry named @file, or nil when
  # the archive has no such entry.
  def text(attachment)
    Zip::File.open(attachment.diskfile) do |zip_file|
      zip_file.each do |entry|
        return xml_to_text(entry.get_input_stream) if entry.name == @file
      end
    end
    # No matching entry: answer nil explicitly instead of handing whatever
    # Zip::File.open evaluated to back to the caller, which expects a String
    # or nil.
    nil
  end

  private

  # Parses the XML read from io and returns the collected text.
  def xml_to_text(io)
    sax_doc = SaxDocument.new @element, @namespace_uri
    Nokogiri::XML::SAX::Parser.new(sax_doc).parse(io)
    sax_doc.text
  end
end
# Common base of the MS Office format extractors: all of them collect the
# content of elements named 't'.
class OfficeDocumentHandler < ZippedXmlHandler
  def initialize
    super()
    @element = 't'
  end
end
# Handles .docx documents: reads 'word/document.xml' and collects elements
# in the wordprocessingml namespace.
class DocxHandler < OfficeDocumentHandler
  def initialize
    super()
    @file = 'word/document.xml'
    @namespace_uri = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    @content_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  end
end
# Handles .xlsx documents: reads 'xl/sharedStrings.xml' and collects
# elements in the spreadsheetml namespace.
class XlsxHandler < OfficeDocumentHandler
  def initialize
    super()
    @file = 'xl/sharedStrings.xml'
    @namespace_uri = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
    @content_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  end
end
# Handles presentations (.pptx and related content types). Instead of one
# fixed archive entry it reads every slide entry and joins the slide texts
# in slide order.
class PptxHandler < OfficeDocumentHandler
  CONTENT_TYPES = [
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
    'application/vnd.ms-powerpoint.template.macroEnabled.12'
  ].freeze

  # Matches entry names ending in "slide<N>.xml" and captures N. Anchored at
  # the end so that names which merely contain such a part (for example
  # "slide1.xml.rels") are not taken for slides.
  SLIDE_ENTRY = /slide(\d+)\.xml\z/

  def initialize
    super
    @content_types = CONTENT_TYPES
    @namespace_uri = 'http://schemas.openxmlformats.org/drawingml/2006/main'
  end

  # Returns the text of all slides, ordered by slide number and joined by a
  # single space. Returns an empty string when no slide entry is found.
  def text(attachment)
    slides = []
    Zip::File.open(attachment.diskfile) do |zip_file|
      zip_file.each do |entry|
        match = SLIDE_ENTRY.match(entry.name)
        next unless match

        # Keep the number as an Integer: compared as strings, "10" would
        # sort before "2".
        slides << [match[1].to_i, xml_to_text(entry.get_input_stream)]
      end
    end
    slides.sort_by(&:first).map(&:last).join(' ')
  end
end
# Extractor for Open / Libre Office formats. Reads 'content.xml' and
# collects the content of 'p' elements in the OpenDocument text namespace.
class OpendocumentHandler < ZippedXmlHandler
  # Frozen so the list shared by all instances cannot be modified by accident.
  CONTENT_TYPES = [
    'application/vnd.oasis.opendocument.presentation',
    'application/vnd.oasis.opendocument.presentation-template',
    'application/vnd.oasis.opendocument.text',
    'application/vnd.oasis.opendocument.text-template',
    'application/vnd.oasis.opendocument.spreadsheet',
    'application/vnd.oasis.opendocument.spreadsheet-template'
  ].freeze

  def initialize
    super
    @file = 'content.xml'
    @content_types = CONTENT_TYPES
    @element = 'p'
    @namespace_uri = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
  end
end
# Extracts text from legacy MS Word files with the command configured under
# 'catdoc', or DEFAULT when none is configured.
class DocHandler < ExternalCommandHandler
  # Both constants are frozen, like DEFAULT in PdfHandler and RtfHandler.
  CONTENT_TYPES = [
    'application/vnd.ms-word',
    'application/msword'
  ].freeze
  DEFAULT = [
    '/usr/bin/catdoc', '-dutf-8', '__FILE__'
  ].freeze

  def initialize
    @content_types = CONTENT_TYPES
    @command = TEXT_EXTRACTORS['catdoc'] || DEFAULT
  end
end
# Extracts text from legacy MS Excel files with the command configured
# under 'xls2csv', or DEFAULT when none is configured.
class XlsHandler < ExternalCommandHandler
  # Both constants are frozen, like DEFAULT in PdfHandler and RtfHandler.
  CONTENT_TYPES = [
    'application/vnd.ms-excel',
    'application/excel'
  ].freeze
  DEFAULT = [
    '/usr/bin/xls2csv', '-dutf-8', '__FILE__'
  ].freeze

  def initialize
    @content_types = CONTENT_TYPES
    @command = TEXT_EXTRACTORS['xls2csv'] || DEFAULT
  end

  # Post-processes the command output: drops all double quotes and turns
  # every run of commas into a single space.
  def text(*_)
    csv = super
    csv.delete('"').gsub(/,+/, ' ') if csv
  end
end
# Extracts text from legacy MS PowerPoint files with the command configured
# under 'catppt', or DEFAULT when none is configured.
class PptHandler < ExternalCommandHandler
  # Both constants are frozen, like DEFAULT in PdfHandler and RtfHandler.
  CONTENT_TYPES = [
    'application/vnd.ms-powerpoint',
    'application/powerpoint'
  ].freeze
  DEFAULT = [
    '/usr/bin/catppt', '-dutf-8', '__FILE__'
  ].freeze

  def initialize
    @content_types = CONTENT_TYPES
    @command = TEXT_EXTRACTORS['catppt'] || DEFAULT
  end
end
# Handles CSV and plain text files by reading the file itself.
class PlaintextHandler < FileHandler
  # Frozen so the list shared by all instances cannot be modified by accident.
  CONTENT_TYPES = %w(text/csv text/plain).freeze

  def initialize
    @content_types = CONTENT_TYPES
  end

  # Returns the whole file content, passed through
  # Redmine::CodesetUtil.to_utf8 with 'UTF-8' as the encoding argument.
  def text(attachment)
    # File.read instead of IO.read: IO.read treats a name starting with "|"
    # as a command to run, File.read always reads a file.
    Redmine::CodesetUtil.to_utf8 File.read(attachment.diskfile), 'UTF-8'
  end
end
# The handler chain: one instance of every handler class, built when this
# class body is evaluated. find_handler picks the first entry whose accept?
# is true, so list the most specific handlers first and more general
# (fallback) handlers later.
@@file_handlers = [
PdfHandler,
OpendocumentHandler,
DocxHandler, XlsxHandler, PptxHandler,
DocHandler, XlsHandler, PptHandler,
RtfHandler,
PlaintextHandler
].map(&:new)
end
end

@ -1,126 +0,0 @@
#-- copyright
# OpenProject is a project management system.
# Copyright (C) 2012-2017 the OpenProject Foundation (OPF)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License version 3.
#
# OpenProject is a fork of ChiliProject, which is a fork of Redmine. The copyright follows:
# Copyright (C) 2006-2017 Jean-Philippe Lang
# Copyright (C) 2010-2013 the ChiliProject Team
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# See doc/COPYRIGHT.rdoc for more details.
#++
require 'spec_helper'
# TODO: Port the commented-out tests below to RSpec examples; they still reference Redmine::TextExtractor.
describe OpenProject::TextExtractor do
# fixtures :projects, :users, :attachments
#
# setup do
# @project = Project.find_by_identifier 'ecookbook'
# set_fixtures_attachments_directory
# @dlopper = User.find_by_login 'dlopper'
# end
#
# def attachment_for(filename, content_type = nil)
# Attachment.new(container: @project,
# file: uploaded_test_file(filename, content_type),
# filename: filename,
# author: @dlopper).tap do |a|
# a.content_type = content_type if content_type
# a.save!
# end
# end
#
# if Redmine::TextExtractor::PdfHandler.available?
# test "should extract text from pdf" do
# a = attachment_for "text.pdf"
# te = Redmine::TextExtractor.new a
# assert text = te.text
# assert_match /lorem ipsum fulltext find me!/, text
# end
# end
#
# if Redmine::TextExtractor::RtfHandler.available?
# test "should extract text from rtf" do
# a = attachment_for "text.rtf"
# te = Redmine::TextExtractor.new a
# assert text = te.text
# assert_match /lorem ipsum fulltext find me!/, text
# end
# end
#
# if Redmine::TextExtractor::DocHandler.available?
# test "should extract text from doc" do
# a = attachment_for "text.doc"
# te = Redmine::TextExtractor.new a
# assert text = te.text
# assert_match /lorem ipsum fulltext find me!/, text
# end
# end
#
# if Redmine::TextExtractor::XlsHandler.available?
# test "should extract text from xls" do
# a = attachment_for "spreadsheet.xls"
# te = Redmine::TextExtractor.new a
# assert text = te.text
# assert_match /lorem ipsum fulltext find me!/, text
# end
# end
#
#
# %w(txt docx odt ott).each do |type|
# test "should extract text from #{type}" do
# a = attachment_for "text.#{type}"
# te = Redmine::TextExtractor.new a
# assert text = te.text
# assert_match /lorem ipsum fulltext find me!/, text
# end
# end
#
#
# %w(xlsx ods ots).each do |type|
# test "should extract text from #{type}" do
# a = attachment_for "spreadsheet.#{type}"
# te = Redmine::TextExtractor.new a
# assert text = te.text
# assert_match /lorem ipsum fulltext find me!/, text
# end
# end
#
#
# %w(pptx ppsx potm odp otp).each do |type|
# test "should extract text from #{type}" do
# a = attachment_for "presentation.#{type}"
# te = Redmine::TextExtractor.new a
# assert text = te.text
# assert_equal 'The Title find me Slide two Click To Add Text', text
# end
# end
#
#
# test "should extract text from csv" do
# a = attachment_for "spreadsheet.csv"
# te = Redmine::TextExtractor.new a
# assert text = te.text
# assert_match /lorem ipsum fulltext find me!/, text.gsub(/(,+|\n+\s*)/m, ' ').squeeze(' ')
# end
#
# end
#
end
Loading…
Cancel
Save