diff --git a/app/workers/extract_fulltext_job.rb b/app/workers/extract_fulltext_job.rb index 96eafde45a..65586c90ec 100644 --- a/app/workers/extract_fulltext_job.rb +++ b/app/workers/extract_fulltext_job.rb @@ -1,3 +1,5 @@ +require 'text_extractor' + class ExtractFulltextJob < ApplicationJob # queue_as :text_extraction @@ -6,9 +8,9 @@ class ExtractFulltextJob < ApplicationJob end def perform - if attachment = find_attachment(@attachment_id) and - attachment.readable? and - text = OpenProject::TextExtractor.new(attachment).text + if (attachment = find_attachment(@attachment_id) and + attachment.readable? and + text = TextExtractor::Resolver.new(attachment.diskfile, attachment.content_type).text) attachment.update_column :fulltext, text end diff --git a/lib/open_project/text_extractor.rb b/lib/open_project/text_extractor.rb deleted file mode 100644 index 43330ba7a9..0000000000 --- a/lib/open_project/text_extractor.rb +++ /dev/null @@ -1,334 +0,0 @@ -#-- encoding: UTF-8 -#-- copyright -# OpenProject is a project management system. -# Copyright (C) 2012-2017 the OpenProject Foundation (OPF) -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License version 3. -# -# OpenProject is a fork of ChiliProject, which is a fork of Redmine. The copyright follows: -# Copyright (C) 2006-2017 Jean-Philippe Lang -# Copyright (C) 2010-2013 the ChiliProject Team -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# -# See doc/COPYRIGHT.rdoc for more details. -#++ - -# Special thanks goes to Jens Kraemer () -# who initially provided this code for Redmine while working on PLANIO. - -module OpenProject - class TextExtractor - - MAX_FULLTEXT_LENGTH = 4.megabytes - TEXT_EXTRACTORS = OpenProject::Configuration['text_extractors'] || {} - - def initialize(attachment) - @attachment = attachment - end - - # returns the extracted fulltext or nil if no matching handler was found - # for the file type. - def text - Rails.logger.debug "TextExtractor text: Attachment" - if handler = find_handler and text = handler.text(@attachment) - text.gsub! /\s+/m, ' ' - text.strip! - text.mb_chars.compose.limit(MAX_FULLTEXT_LENGTH).to_s - end - rescue Exception => e - Rails.logger.error "error in fulltext extraction: #{e}" - raise e unless e.is_a? StandardError # re-raise Signals / SyntaxErrors etc - end - - private - - def find_handler - @@file_handlers.detect{|h| h.accept? @attachment } - end - - class FileHandler - def accept?(attachment) - if @content_type - attachment.content_type == @content_type - elsif @content_types - @content_types.include? attachment.content_type - else - false - end - end - end - - class ExternalCommandHandler < FileHandler - # TODO: Extract this to a proper module - # Executes the given command through IO.popen and yields an IO object - # representing STDIN / STDOUT - # - # Due to how popen works the command will be executed directly without - # involving the shell if cmd is an array. - require 'fileutils' - def shellout(cmd, options = {}, &block) - mode = "r+" - IO.popen(cmd, mode) do |io| - io.set_encoding("ASCII-8BIT") if io.respond_to?(:set_encoding) - io.close_write unless options[:write_stdin] - block.call(io) if block_given? - end - end - - FILE_PLACEHOLDER = '__FILE__'.freeze - - def text(attachment) - cmd = @command.dup - cmd[cmd.index(FILE_PLACEHOLDER)] = attachment.diskfile.path - shellout(cmd){ |io| io.read }.to_s - end - - def accept?(attachment) - super and available? - end - - def available? - @command.present? and File.executable?(@command[0]) - end - - def self.available? - new.available? - end - end - - - class PdfHandler < ExternalCommandHandler - DEFAULT = [ - '/usr/bin/pdftotext', '-enc', 'UTF-8', '__FILE__', '-' - ].freeze - def initialize - @content_type = 'application/pdf' - @command = TEXT_EXTRACTORS['pdftotext'] || DEFAULT - end - end - - - class RtfHandler < ExternalCommandHandler - DEFAULT = [ - '/usr/bin/unrtf', '--text', '__FILE__' - ].freeze - def initialize - @content_type = 'application/rtf' - @command = TEXT_EXTRACTORS['unrtf'] || DEFAULT - end - end - - - # Handler base class for XML based (MS / Open / Libre) office documents. - class ZippedXmlHandler < FileHandler - require 'zip' - - class SaxDocument < Nokogiri::XML::SAX::Document - attr_reader :text - - def initialize(text_element, text_namespace) - @element = text_element - @namespace_uri = text_namespace - @text = ''.dup - @is_text = false - end - - # Handle each element, expecting the name and any attributes - def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = []) - if name == @element and uri == @namespace_uri - @is_text = true - end - end - - # Any characters between the start and end element expected as a string - def characters(string) - @text << string if @is_text - end - - # Given the name of an element once its closing tag is reached - def end_element_namespace(name, prefix = nil, uri = nil) - if name == @element and uri == @namespace_uri - @text << ' ' - @is_text = false - end - end - end - - def text(attachment) - Zip::File.open(attachment.diskfile) do |zip_file| - zip_file.each do |entry| - if entry.name == @file - return xml_to_text entry.get_input_stream - end - end - end - end - - private - - def xml_to_text(io) - sax_doc = SaxDocument.new @element, @namespace_uri - Nokogiri::XML::SAX::Parser.new(sax_doc).parse(io) - sax_doc.text - end - end - - # Base class for extractors for MS Office formats - class OfficeDocumentHandler < ZippedXmlHandler - def initialize - super - @element = 't' - end - end - - class DocxHandler < OfficeDocumentHandler - def initialize - super - @content_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' - @file = 'word/document.xml' - @namespace_uri = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' - end - end - - - class XlsxHandler < OfficeDocumentHandler - def initialize - super - @content_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' - @file = 'xl/sharedStrings.xml' - @namespace_uri = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main' - end - end - - - - class PptxHandler < OfficeDocumentHandler - CONTENT_TYPES = [ - 'application/vnd.openxmlformats-officedocument.presentationml.presentation', - 'application/vnd.openxmlformats-officedocument.presentationml.slideshow', - 'application/vnd.ms-powerpoint.template.macroEnabled.12' - ] - - def initialize - super - @content_types = CONTENT_TYPES - @namespace_uri = 'http://schemas.openxmlformats.org/drawingml/2006/main' - end - - def text(attachment) - slides = [] - Zip::File.open(attachment.diskfile) do |zip_file| - zip_file.each do |entry| - if entry.name =~ /slide(\d+)\.xml/ - slides << [$1, xml_to_text(entry.get_input_stream)] - end - end - end - slides.sort!{|a, b| a.first <=> b.first} - slides.map(&:last).join ' ' - end - end - - - # Extractor for Open / Libre Office formats - class OpendocumentHandler < ZippedXmlHandler - CONTENT_TYPES = [ - 'application/vnd.oasis.opendocument.presentation', - 'application/vnd.oasis.opendocument.presentation-template', - 'application/vnd.oasis.opendocument.text', - 'application/vnd.oasis.opendocument.text-template', - 'application/vnd.oasis.opendocument.spreadsheet', - 'application/vnd.oasis.opendocument.spreadsheet-template' - ] - def initialize - super - @file = 'content.xml' - @content_types = CONTENT_TYPES - @element = 'p' - @namespace_uri = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0' - end - end - - class DocHandler < ExternalCommandHandler - CONTENT_TYPES = [ - 'application/vnd.ms-word', - 'application/msword' - ] - DEFAULT = [ - '/usr/bin/catdoc', '-dutf-8', '__FILE__' - ] - def initialize - @content_types = CONTENT_TYPES - @command = TEXT_EXTRACTORS['catdoc'] || DEFAULT - end - end - - class XlsHandler < ExternalCommandHandler - CONTENT_TYPES = [ - 'application/vnd.ms-excel', - 'application/excel' - ] - DEFAULT = [ - '/usr/bin/xls2csv', '-dutf-8', '__FILE__' - ] - def initialize - @content_types = CONTENT_TYPES - @command = TEXT_EXTRACTORS['xls2csv'] || DEFAULT - end - def text(*_) - if str = super - str.delete('"').gsub /,+/, ' ' - end - end - end - - class PptHandler < ExternalCommandHandler - CONTENT_TYPES = [ - 'application/vnd.ms-powerpoint', - 'application/powerpoint', - ] - DEFAULT = [ - '/usr/bin/catppt', '-dutf-8', '__FILE__' - ] - def initialize - @content_types = CONTENT_TYPES - @command = TEXT_EXTRACTORS['catppt'] || DEFAULT - end - end - - class PlaintextHandler < FileHandler - CONTENT_TYPES = %w(text/csv text/plain) - def initialize - @content_types = CONTENT_TYPES - end - def text(attachment) - Redmine::CodesetUtil.to_utf8 IO.read(attachment.diskfile), 'UTF-8' - end - end - - # the handler chain. List most specific handlers first and more general - # (fallback) handlers later. - @@file_handlers = [ - PdfHandler, - OpendocumentHandler, - DocxHandler, XlsxHandler, PptxHandler, - DocHandler, XlsHandler, PptHandler, - RtfHandler, - PlaintextHandler - ].map(&:new) - end -end diff --git a/spec/lib/open_project/text_extractor_spec.rb b/spec/lib/open_project/text_extractor_spec.rb deleted file mode 100644 index a4854beb1e..0000000000 --- a/spec/lib/open_project/text_extractor_spec.rb +++ /dev/null @@ -1,126 +0,0 @@ -#-- copyright -# OpenProject is a project management system. -# Copyright (C) 2012-2017 the OpenProject Foundation (OPF) -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License version 3. -# -# OpenProject is a fork of ChiliProject, which is a fork of Redmine. The copyright follows: -# Copyright (C) 2006-2017 Jean-Philippe Lang -# Copyright (C) 2010-2013 the ChiliProject Team -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -# -# See doc/COPYRIGHT.rdoc for more details. -#++ - -require 'spec_helper' -# TODO: Rewrite these tests to specs. -describe OpenProject::TextExtractor do - # fixtures :projects, :users, :attachments - # - # setup do - # @project = Project.find_by_identifier 'ecookbook' - # set_fixtures_attachments_directory - # @dlopper = User.find_by_login 'dlopper' - # end - # - # def attachment_for(filename, content_type = nil) - # Attachment.new(container: @project, - # file: uploaded_test_file(filename, content_type), - # filename: filename, - # author: @dlopper).tap do |a| - # a.content_type = content_type if content_type - # a.save! - # end - # end - # - # if Redmine::TextExtractor::PdfHandler.available? - # test "should extract text from pdf" do - # a = attachment_for "text.pdf" - # te = Redmine::TextExtractor.new a - # assert text = te.text - # assert_match /lorem ipsum fulltext find me!/, text - # end - # end - # - # if Redmine::TextExtractor::RtfHandler.available? - # test "should extract text from rtf" do - # a = attachment_for "text.rtf" - # te = Redmine::TextExtractor.new a - # assert text = te.text - # assert_match /lorem ipsum fulltext find me!/, text - # end - # end - # - # if Redmine::TextExtractor::DocHandler.available? - # test "should extract text from doc" do - # a = attachment_for "text.doc" - # te = Redmine::TextExtractor.new a - # assert text = te.text - # assert_match /lorem ipsum fulltext find me!/, text - # end - # end - # - # if Redmine::TextExtractor::XlsHandler.available? - # test "should extract text from xls" do - # a = attachment_for "spreadsheet.xls" - # te = Redmine::TextExtractor.new a - # assert text = te.text - # assert_match /lorem ipsum fulltext find me!/, text - # end - # end - # - # - # %w(txt docx odt ott).each do |type| - # test "should extract text from #{type}" do - # a = attachment_for "text.#{type}" - # te = Redmine::TextExtractor.new a - # assert text = te.text - # assert_match /lorem ipsum fulltext find me!/, text - # end - # end - # - # - # %w(xlsx ods ots).each do |type| - # test "should extract text from #{type}" do - # a = attachment_for "spreadsheet.#{type}" - # te = Redmine::TextExtractor.new a - # assert text = te.text - # assert_match /lorem ipsum fulltext find me!/, text - # end - # end - # - # - # %w(pptx ppsx potm odp otp).each do |type| - # test "should extract text from #{type}" do - # a = attachment_for "presentation.#{type}" - # te = Redmine::TextExtractor.new a - # assert text = te.text - # assert_equal 'The Title find me Slide two Click To Add Text', text - # end - # end - # - # - # test "should extract text from csv" do - # a = attachment_for "spreadsheet.csv" - # te = Redmine::TextExtractor.new a - # assert text = te.text - # assert_match /lorem ipsum fulltext find me!/, text.gsub(/(,+|\n+\s*)/m, ' ').squeeze(' ') - # end - # - # end - # -end