Split Markdown rendering & reference gathering

This splits the Markdown rendering and reference extraction phases into
two distinct code bases. The reference extraction phase no longer relies
on the html-pipeline Gem (and any related code) and allows for
extracting of references from multiple HTML nodes in a single pass. This
means that if you want to extract user references from 200 comments you
no longer need to run 200 times N number of queries, instead only a
handful of queries may be needed.
parent 94d5416d
......@@ -91,8 +91,8 @@ class Projects::WikisController < Projects::ApplicationController
def markdown_preview
text = params[:text]
ext = Gitlab::ReferenceExtractor.new(@project, current_user, current_user)
ext.analyze(text)
ext = Gitlab::ReferenceExtractor.new(@project, current_user)
ext.analyze(text, author: current_user)
render json: {
body: view_context.markdown(text, pipeline: :wiki, project_wiki: @project_wiki),
......
......@@ -197,8 +197,8 @@ class ProjectsController < Projects::ApplicationController
def markdown_preview
text = params[:text]
ext = Gitlab::ReferenceExtractor.new(@project, current_user, current_user)
ext.analyze(text)
ext = Gitlab::ReferenceExtractor.new(@project, current_user)
ext.analyze(text, author: current_user)
render json: {
body: view_context.markdown(text),
......
......@@ -18,10 +18,6 @@ module Banzai
@object_sym ||= object_name.to_sym
end
def self.data_reference
@data_reference ||= "data-#{object_name.dasherize}"
end
def self.object_class_title
@object_title ||= object_class.name.titleize
end
......@@ -45,10 +41,6 @@ module Banzai
end
end
def self.referenced_by(node)
{ object_sym => LazyReference.new(object_class, node.attr(data_reference)) }
end
def object_class
self.class.object_class
end
......
......@@ -4,6 +4,8 @@ module Banzai
#
# This filter supports cross-project references.
class CommitRangeReferenceFilter < AbstractReferenceFilter
self.reference_type = :commit_range
def self.object_class
CommitRange
end
......@@ -14,34 +16,18 @@ module Banzai
end
end
def self.referenced_by(node)
project = Project.find(node.attr("data-project")) rescue nil
return unless project
id = node.attr("data-commit-range")
range = find_object(project, id)
return unless range
{ commit_range: range }
end
def initialize(*args)
super
@commit_map = {}
end
def self.find_object(project, id)
def find_object(project, id)
range = CommitRange.new(id, project)
range.valid_commits? ? range : nil
end
def find_object(*args)
self.class.find_object(*args)
end
def url_for_object(range, project)
h = Gitlab::Routing.url_helpers
h.namespace_project_compare_url(project.namespace, project,
......
......@@ -4,6 +4,8 @@ module Banzai
#
# This filter supports cross-project references.
class CommitReferenceFilter < AbstractReferenceFilter
self.reference_type = :commit
def self.object_class
Commit
end
......@@ -14,28 +16,12 @@ module Banzai
end
end
def self.referenced_by(node)
project = Project.find(node.attr("data-project")) rescue nil
return unless project
id = node.attr("data-commit")
commit = find_object(project, id)
return unless commit
{ commit: commit }
end
def self.find_object(project, id)
def find_object(project, id)
if project && project.valid_repo?
project.commit(id)
end
end
def find_object(*args)
self.class.find_object(*args)
end
def url_for_object(commit, project)
h = Gitlab::Routing.url_helpers
h.namespace_project_commit_url(project.namespace, project, commit,
......
......@@ -4,6 +4,8 @@ module Banzai
# References are ignored if the project doesn't use an external issue
# tracker.
class ExternalIssueReferenceFilter < ReferenceFilter
self.reference_type = :external_issue
# Public: Find `JIRA-123` issue references in text
#
# ExternalIssueReferenceFilter.references_in(text) do |match, issue|
......@@ -21,18 +23,6 @@ module Banzai
end
end
def self.referenced_by(node)
project = Project.find(node.attr("data-project")) rescue nil
return unless project
id = node.attr("data-external-issue")
external_issue = ExternalIssue.new(id, project)
return unless external_issue
{ external_issue: external_issue }
end
def call
# Early return if the project isn't using an external tracker
return doc if project.nil? || default_issues_tracker?
......
......@@ -5,18 +5,12 @@ module Banzai
#
# This filter supports cross-project references.
class IssueReferenceFilter < AbstractReferenceFilter
self.reference_type = :issue
def self.object_class
Issue
end
def self.user_can_see_reference?(user, node, context)
# It is not possible to check access rights for external issue trackers
return true if context[:project].try(:external_issue_tracker)
issue = Issue.find(node.attr('data-issue')) rescue nil
Ability.abilities.allowed?(user, :read_issue, issue)
end
def find_object(project, id)
project.get_issue(id)
end
......
......@@ -2,6 +2,8 @@ module Banzai
module Filter
# HTML filter that replaces label references with links.
class LabelReferenceFilter < AbstractReferenceFilter
self.reference_type = :label
def self.object_class
Label
end
......
......@@ -5,6 +5,8 @@ module Banzai
#
# This filter supports cross-project references.
class MergeRequestReferenceFilter < AbstractReferenceFilter
self.reference_type = :merge_request
def self.object_class
MergeRequest
end
......
......@@ -2,6 +2,8 @@ module Banzai
module Filter
# HTML filter that replaces milestone references with links.
class MilestoneReferenceFilter < AbstractReferenceFilter
self.reference_type = :milestone
def self.object_class
Milestone
end
......
......@@ -7,8 +7,11 @@ module Banzai
#
class RedactorFilter < HTML::Pipeline::Filter
def call
Querying.css(doc, 'a.gfm').each do |node|
unless user_can_see_reference?(node)
nodes = Querying.css(doc, 'a.gfm[data-reference-type]')
visible = nodes_visible_to_user(nodes)
nodes.each do |node|
unless visible.include?(node)
# The reference should be replaced by the original text,
# which is not always the same as the rendered text.
text = node.attr('data-original') || node.text
......@@ -21,20 +24,30 @@ module Banzai
private
def user_can_see_reference?(node)
if node.has_attribute?('data-reference-filter')
reference_type = node.attr('data-reference-filter')
reference_filter = Banzai::Filter.const_get(reference_type)
def nodes_visible_to_user(nodes)
per_type = Hash.new { |h, k| h[k] = [] }
visible = Set.new
nodes.each do |node|
per_type[node.attr('data-reference-type')] << node
end
per_type.each do |type, nodes|
parser = Banzai::ReferenceParser[type].new(project, current_user)
reference_filter.user_can_see_reference?(current_user, node, context)
else
true
visible.merge(parser.nodes_visible_to_user(current_user, nodes))
end
visible
end
def current_user
context[:current_user]
end
def project
context[:project]
end
end
end
end
......@@ -8,24 +8,8 @@ module Banzai
# :project (required) - Current project, ignored if reference is cross-project.
# :only_path - Generate path-only links.
class ReferenceFilter < HTML::Pipeline::Filter
def self.user_can_see_reference?(user, node, context)
if node.has_attribute?('data-project')
project_id = node.attr('data-project').to_i
return true if project_id == context[:project].try(:id)
project = Project.find(project_id) rescue nil
Ability.abilities.allowed?(user, :read_project, project)
else
true
end
end
def self.user_can_reference?(user, node, context)
true
end
def self.referenced_by(node)
raise NotImplementedError, "#{self} does not implement #{__method__}"
class << self
attr_accessor :reference_type
end
# Returns a data attribute String to attach to a reference link
......@@ -43,7 +27,9 @@ module Banzai
#
# Returns a String
def data_attribute(attributes = {})
attributes[:reference_filter] = self.class.name.demodulize
attributes = attributes.reject { |_, v| v.nil? }
attributes[:reference_type] = self.class.reference_type
attributes.delete(:original) if context[:no_original_data]
attributes.map { |key, value| %Q(data-#{key.to_s.dasherize}="#{escape_once(value)}") }.join(" ")
end
......
module Banzai
module Filter
# HTML filter that gathers all referenced records that the current user has
# permission to view.
#
# Expected to be run in its own post-processing pipeline.
#
class ReferenceGathererFilter < HTML::Pipeline::Filter
def initialize(*)
super
result[:references] ||= Hash.new { |hash, type| hash[type] = [] }
end
def call
Querying.css(doc, 'a.gfm').each do |node|
gather_references(node)
end
load_lazy_references unless ReferenceExtractor.lazy?
doc
end
private
def gather_references(node)
return unless node.has_attribute?('data-reference-filter')
reference_type = node.attr('data-reference-filter')
reference_filter = Banzai::Filter.const_get(reference_type)
return if context[:reference_filter] && reference_filter != context[:reference_filter]
return if author && !reference_filter.user_can_reference?(author, node, context)
return unless reference_filter.user_can_see_reference?(current_user, node, context)
references = reference_filter.referenced_by(node)
return unless references
references.each do |type, values|
Array.wrap(values).each do |value|
result[:references][type] << value
end
end
end
def load_lazy_references
refs = result[:references]
refs.each do |type, values|
refs[type] = ReferenceExtractor.lazily(values)
end
end
def current_user
context[:current_user]
end
def author
context[:author]
end
end
end
end
......@@ -5,6 +5,8 @@ module Banzai
#
# This filter supports cross-project references.
class SnippetReferenceFilter < AbstractReferenceFilter
self.reference_type = :snippet
def self.object_class
Snippet
end
......
......@@ -4,6 +4,8 @@ module Banzai
#
# A special `@all` reference is also supported.
class UserReferenceFilter < ReferenceFilter
self.reference_type = :user
# Public: Find `@user` user references in text
#
# UserReferenceFilter.references_in(text) do |match, username|
......@@ -21,43 +23,6 @@ module Banzai
end
end
def self.referenced_by(node)
if node.has_attribute?('data-group')
group = Group.find(node.attr('data-group')) rescue nil
return unless group
{ user: group.users }
elsif node.has_attribute?('data-user')
{ user: LazyReference.new(User, node.attr('data-user')) }
elsif node.has_attribute?('data-project')
project = Project.find(node.attr('data-project')) rescue nil
return unless project
{ user: project.team.members.flatten }
end
end
def self.user_can_see_reference?(user, node, context)
if node.has_attribute?('data-group')
group = Group.find(node.attr('data-group')) rescue nil
Ability.abilities.allowed?(user, :read_group, group)
else
super
end
end
def self.user_can_reference?(user, node, context)
# Only team members can reference `@all`
if node.has_attribute?('data-project')
project = Project.find(node.attr('data-project')) rescue nil
return false unless project
user && project.team.member?(user)
else
super
end
end
def call
return doc if project.nil?
......@@ -114,9 +79,12 @@ module Banzai
def link_to_all(link_text: nil)
project = context[:project]
author = context[:author]
url = urls.namespace_project_url(project.namespace, project,
only_path: context[:only_path])
data = data_attribute(project: project.id)
data = data_attribute(project: project.id, author: author.try(:id))
text = link_text || User.reference_prefix + 'all'
link_tag(url, data, text)
......
module Banzai
class LazyReference
def self.load(refs)
lazy_references, values = refs.partition { |ref| ref.is_a?(self) }
lazy_values = lazy_references.group_by(&:klass).flat_map do |klass, refs|
ids = refs.flat_map(&:ids)
klass.where(id: ids)
end
values + lazy_values
end
attr_reader :klass, :ids
def initialize(klass, ids)
@klass = klass
@ids = Array.wrap(ids).map(&:to_i)
end
def load
self.klass.where(id: self.ids)
end
end
end
module Banzai
module Pipeline
class ReferenceExtractionPipeline < BasePipeline
def self.filters
FilterArray[
Filter::ReferenceGathererFilter
]
end
end
end
end
module Banzai
# Extract possible GFM references from an arbitrary String for further processing.
class ReferenceExtractor
class << self
LAZY_KEY = :banzai_reference_extractor_lazy
def lazy?
Thread.current[LAZY_KEY]
end
def lazily(values = nil, &block)
return (values || block.call).uniq if lazy?
begin
Thread.current[LAZY_KEY] = true
values ||= block.call
Banzai::LazyReference.load(values.uniq).uniq
ensure
Thread.current[LAZY_KEY] = false
end
end
end
def initialize
@texts = []
end
......@@ -31,23 +9,21 @@ module Banzai
@texts << Renderer.render(text, context)
end
def references(type, context = {})
filter = Banzai::Filter["#{type}_reference"]
def references(type, project, current_user = nil)
processor = Banzai::ReferenceParser[type].
new(project, current_user)
processor.process(html_documents)
end
context.merge!(
pipeline: :reference_extraction,
private
# ReferenceGathererFilter
reference_filter: filter
)
def html_documents
# This ensures that we don't memoize anything until we have a number of
# text blobs to parse.
return [] if @texts.empty?
self.class.lazily do
@texts.flat_map do |html|
text_context = context.dup
result = Renderer.render_result(html, text_context)
result[:references][type]
end.uniq
end
@html_documents ||= @texts.map { |html| Nokogiri::HTML.fragment(html) }
end
end
end
module Banzai
module ReferenceParser
# Returns the reference parser class for the given type
#
# Example:
#
# Banzai::ReferenceParser['issue']
#
# This would return the `Banzai::ReferenceParser::IssueParser` class.
def self.[](name)
const_get("#{name.to_s.camelize}Parser")
end
end
end
module Banzai
module ReferenceParser
# Base class for reference parsing classes.
#
# Each parser should also specify its reference type by calling
# `self.reference_type = ...` in the body of the class. The value of this
# method should be a symbol such as `:issue` or `:merge_request`. For
# example:
#
# class IssueParser < BaseParser
# self.reference_type = :issue
# end
#
# The reference type is used to determine what nodes to pass to the
# `referenced_by` method.
#
# Parser classes should either implement the instance method
# `references_relation` or overwrite `referenced_by`. The
# `references_relation` method is supposed to return an
# ActiveRecord::Relation used as a base relation for retrieving the objects
# referenced in a set of HTML nodes.
#
# Each class can implement two additional methods:
#
# * `nodes_user_can_reference`: returns an Array of nodes the given user can
# refer to.
# * `nodes_visible_to_user`: returns an Array of nodes that are visible to
# the given user.
#
# You only need to overwrite these methods if you want to tweak who can see
# which references. For example, the IssueParser class defines its own
# `nodes_visible_to_user` method so it can ensure users can only see issues
# they have access to.
class BaseParser
class << self
attr_accessor :reference_type
end
# Returns the attribute name containing the value for every object to be
# parsed by the current parser.
#
# For example, for a parser class that returns "Animal" objects this
# attribute would be "data-animal".
def self.data_attribute
@data_attribute ||= "data-#{reference_type.to_s.dasherize}"
end
def initialize(project = nil, current_user = nil)
@project = project
@current_user = current_user
end
# Returns all the nodes containing references that the user can refer to.
def nodes_user_can_reference(user, nodes)
nodes
end
# Returns all the nodes that are visible to the given user.
def nodes_visible_to_user(user, nodes)
projects = lazy { projects_for_nodes(nodes) }
project_attr = 'data-project'