From f5bac5fda88b4562efadb22abefc09d81c1cf631 Mon Sep 17 00:00:00 2001 From: Jan Provaznik Date: Tue, 26 Mar 2019 16:08:41 +0000 Subject: [PATCH] Rake task for removing exif from uploads Adds a rake task which can be used for removing EXIF data from existing uploads. --- GITLAB_WORKHORSE_VERSION | 2 +- .../unreleased/security-exif-migration.yml | 5 + .../raketasks/uploads/sanitize.md | 62 +++++++ doc/raketasks/README.md | 1 + lib/gitlab/sanitizers/exif.rb | 157 ++++++++++++++++++ lib/tasks/gitlab/uploads/sanitize.rake | 18 ++ spec/lib/gitlab/sanitizers/exif_spec.rb | 120 +++++++++++++ 7 files changed, 364 insertions(+), 1 deletion(-) create mode 100644 changelogs/unreleased/security-exif-migration.yml create mode 100644 doc/administration/raketasks/uploads/sanitize.md create mode 100644 lib/gitlab/sanitizers/exif.rb create mode 100644 lib/tasks/gitlab/uploads/sanitize.rake create mode 100644 spec/lib/gitlab/sanitizers/exif_spec.rb diff --git a/GITLAB_WORKHORSE_VERSION b/GITLAB_WORKHORSE_VERSION index 56b6be4ebb2..9c78b761ea1 100644 --- a/GITLAB_WORKHORSE_VERSION +++ b/GITLAB_WORKHORSE_VERSION @@ -1 +1 @@ -8.3.1 +8.3.2 diff --git a/changelogs/unreleased/security-exif-migration.yml b/changelogs/unreleased/security-exif-migration.yml new file mode 100644 index 00000000000..cc529099df5 --- /dev/null +++ b/changelogs/unreleased/security-exif-migration.yml @@ -0,0 +1,5 @@ +--- +title: Added rake task for removing EXIF data from existing uploads. +merge_request: +author: +type: security diff --git a/doc/administration/raketasks/uploads/sanitize.md b/doc/administration/raketasks/uploads/sanitize.md new file mode 100644 index 00000000000..54a423b9571 --- /dev/null +++ b/doc/administration/raketasks/uploads/sanitize.md @@ -0,0 +1,62 @@ +# Uploads Sanitize tasks + +## Requirements + +You need `exiftool` installed on your system. If you installed GitLab: + +- Using the Omnibus package, you're all set. +- From source, make sure `exiftool` is installed: + + ```sh + # Debian/Ubuntu + sudo apt-get install libimage-exiftool-perl + + # RHEL/CentOS + sudo yum install perl-Image-ExifTool + ``` + +## Remove EXIF data from existing uploads + +Since 11.9 EXIF data are automatically stripped from JPG or TIFF image uploads. +Because EXIF data may contain sensitive information (e.g. GPS location), you +can remove EXIF data also from existing images which were uploaded before +with the following command: + +```bash +sudo RAILS_ENV=production -u git -H bundle exec rake gitlab:uploads:sanitize:remove_exif +``` + +This command by default runs in dry mode and it doesn't remove EXIF data. It can be used for +checking if (and how many) images should be sanitized. + +The rake task accepts following parameters. + +Parameter | Type | Description +--------- | ---- | ----------- +`start_id` | integer | Only uploads with equal or greater ID will be processed +`stop_id` | integer | Only uploads with equal or smaller ID will be processed +`dry_run` | boolean | Do not remove EXIF data, only check if EXIF data are present or not, default: true +`sleep_time` | float | Pause for number of seconds after processing each image, default: 0.3 seconds + +If you have too many uploads, you can speed up sanitization by setting +`sleep_time` to a lower value or by running multiple rake tasks in parallel, +each with a separate range of upload IDs (by setting `start_id` and `stop_id`). + +To run the command without dry mode and remove EXIF data from all uploads, you can use: + +```bash +sudo RAILS_ENV=production -u git -H bundle exec rake gitlab:uploads:sanitize:remove_exif[,,false,] 2>&1 | tee exif.log +``` + +To run the command without dry mode on uploads with ID between 100 and 5000 and pause for 0.1 second, you can use: + +```bash +sudo RAILS_ENV=production -u git -H bundle exec rake gitlab:uploads:sanitize:remove_exif[100,5000,false,0.1] 2>&1 | tee exif.log +``` + +Because the output of commands will be probably long, the output is written also into exif.log file. + +If sanitization fails for an upload, an error message should be in the output of the rake task (typical reasons may +be that the file is missing in the storage or it's not a valid image). Please +[report](https://gitlab.com/gitlab-org/gitlab-ce/issues/new) any issues at `gitlab.com` and use +prefix 'EXIF' in issue title with the error output and (if possible) the image. diff --git a/doc/raketasks/README.md b/doc/raketasks/README.md index 90187617c41..0729875daf8 100644 --- a/doc/raketasks/README.md +++ b/doc/raketasks/README.md @@ -15,3 +15,4 @@ comments: false - [Import](import.md) of git repositories in bulk - [Rebuild authorized_keys file](http://docs.gitlab.com/ce/raketasks/maintenance.html#rebuild-authorized_keys-file) task for administrators - [Migrate Uploads](../administration/raketasks/uploads/migrate.md) +- [Sanitize Uploads](../administration/raketasks/uploads/sanitize.md) diff --git a/lib/gitlab/sanitizers/exif.rb b/lib/gitlab/sanitizers/exif.rb new file mode 100644 index 00000000000..7427aba382f --- /dev/null +++ b/lib/gitlab/sanitizers/exif.rb @@ -0,0 +1,157 @@ +# frozen_string_literal: true + +module Gitlab + module Sanitizers + class Exif + # these tags are not removed from the image + WHITELISTED_TAGS = %w( + ResolutionUnit + XResolution + YResolution + YCbCrSubSampling + YCbCrPositioning + BitsPerSample + ImageHeight + ImageWidth + ImageSize + Copyright + CopyrightNotice + ).freeze + + # these tags are common in exiftool output, these + # do not contain any sensitive information, but + # we don't need to preserve them when removing + # exif tags + IGNORED_TAGS = %w( + ColorComponents + EncodingProcess + ExifByteOrder + ExifToolVersion + JFIFVersion + Directory + FileAccessDate + FileInodeChangeDate + FileModifyDate + FileName + FilePermissions + FileSize + SourceFile + Megapixels + FileType + FileTypeExtension + MIMEType + ).freeze + + ALLOWED_TAGS = WHITELISTED_TAGS + IGNORED_TAGS + EXCLUDE_PARAMS = WHITELISTED_TAGS.map { |tag| "-#{tag}" } + + attr_reader :logger + + def initialize(logger: Rails.logger) + @logger = logger + end + + # rubocop: disable CodeReuse/ActiveRecord + def batch_clean(start_id: nil, stop_id: nil, dry_run: true, sleep_time: nil) + relation = Upload.where('lower(path) like ? or lower(path) like ? or lower(path) like ?', + '%.jpg', '%.jpeg', '%.tiff') + + logger.info "running in dry run mode, no images will be rewritten" if dry_run + + find_params = { + start: start_id.present? ? start_id.to_i : nil, + finish: stop_id.present? ? stop_id.to_i : Upload.last&.id + } + + relation.find_each(find_params) do |upload| + begin + clean(upload.build_uploader, dry_run: dry_run) + sleep sleep_time if sleep_time + rescue => err + logger.error "failed to sanitize #{upload_ref(upload)}: #{err.message}" + logger.debug err.backtrace.join("\n ") + end + end + end + # rubocop: enable CodeReuse/ActiveRecord + + def clean(uploader, dry_run: true) + Dir.mktmpdir('gitlab-exif') do |tmpdir| + src_path = fetch_upload_to_file(uploader, tmpdir) + + to_remove = extra_tags(src_path) + + if to_remove.empty? + logger.info "#{upload_ref(uploader.upload)}: only whitelisted tags present, skipping" + break + end + + logger.info "#{upload_ref(uploader.upload)}: found exif tags to remove: #{to_remove}" + + break if dry_run + + remove_and_store(tmpdir, src_path, uploader) + end + end + + def extra_tags(path) + exif_tags(path).keys - ALLOWED_TAGS + end + + private + + def remove_and_store(tmpdir, src_path, uploader) + exec_remove_exif!(src_path) + logger.info "#{upload_ref(uploader.upload)}: exif removed, storing" + File.open(src_path, 'r') { |f| uploader.store!(f) } + end + + def exec_remove_exif!(path) + # IPTC and XMP-iptcExt groups may keep copyright information so + # we always preserve them + cmd = ["exiftool", "-all=", "-tagsFromFile", "@", *EXCLUDE_PARAMS, "--IPTC:all", "--XMP-iptcExt:all", path] + output, status = Gitlab::Popen.popen(cmd) + + if status != 0 + raise "exiftool return code is #{status}: #{output}" + end + + if File.size(path) == 0 + raise "size of file is 0" + end + + # exiftool creates backup of the original file in filename_original + old_path = "#{path}_original" + if File.size(path) == File.size(old_path) + raise "size of sanitized file is same as original size" + end + end + + def fetch_upload_to_file(uploader, dir) + # upload is stored into the file with the original name - this filename + # is used by carrierwave when storing the file back to the storage + filename = File.join(dir, uploader.filename) + + File.open(filename, 'w') do |file| + file.binmode + file.write uploader.read + end + + filename + end + + def upload_ref(upload) + "#{upload.id}:#{upload.path}" + end + + def exif_tags(path) + cmd = ["exiftool", "-all", "-j", "-sort", "--IPTC:all", "--XMP-iptcExt:all", path] + output, status = Gitlab::Popen.popen(cmd) + + raise "failed to get exif tags: #{output}" if status != 0 + + JSON.parse(output).first + end + end + end +end diff --git a/lib/tasks/gitlab/uploads/sanitize.rake b/lib/tasks/gitlab/uploads/sanitize.rake new file mode 100644 index 00000000000..12cf5302555 --- /dev/null +++ b/lib/tasks/gitlab/uploads/sanitize.rake @@ -0,0 +1,18 @@ +namespace :gitlab do + namespace :uploads do + namespace :sanitize do + desc 'GitLab | Uploads | Remove EXIF from images.' + task :remove_exif, [:start_id, :stop_id, :dry_run, :sleep_time] => :environment do |task, args| + args.with_defaults(dry_run: 'true') + args.with_defaults(sleep_time: 0.3) + + logger = Logger.new(STDOUT) + + sanitizer = Gitlab::Sanitizers::Exif.new(logger: logger) + sanitizer.batch_clean(start_id: args.start_id, stop_id: args.stop_id, + dry_run: args.dry_run != 'false', + sleep_time: args.sleep_time.to_f) + end + end + end +end diff --git a/spec/lib/gitlab/sanitizers/exif_spec.rb b/spec/lib/gitlab/sanitizers/exif_spec.rb new file mode 100644 index 00000000000..bd5f330c7a1 --- /dev/null +++ b/spec/lib/gitlab/sanitizers/exif_spec.rb @@ -0,0 +1,120 @@ +require 'spec_helper' + +describe Gitlab::Sanitizers::Exif do + let(:sanitizer) { described_class.new } + + describe '#batch_clean' do + context 'with image uploads' do + let!(:uploads) { create_list(:upload, 3, :with_file, :issuable_upload) } + + it 'processes all uploads if range ID is not set' do + expect(sanitizer).to receive(:clean).exactly(3).times + + sanitizer.batch_clean + end + + it 'processes only uploads in the selected range' do + expect(sanitizer).to receive(:clean).once + + sanitizer.batch_clean(start_id: uploads[1].id, stop_id: uploads[1].id) + end + + it 'pauses if sleep_time is set' do + expect(sanitizer).to receive(:sleep).exactly(3).times.with(1.second) + expect(sanitizer).to receive(:clean).exactly(3).times + + sanitizer.batch_clean(sleep_time: 1) + end + end + + it 'filters only jpg/tiff images' do + create(:upload, path: 'filename.jpg') + create(:upload, path: 'filename.jpeg') + create(:upload, path: 'filename.JPG') + create(:upload, path: 'filename.tiff') + create(:upload, path: 'filename.TIFF') + create(:upload, path: 'filename.png') + create(:upload, path: 'filename.txt') + + expect(sanitizer).to receive(:clean).exactly(5).times + sanitizer.batch_clean + end + end + + describe '#clean' do + let(:uploader) { create(:upload, :with_file, :issuable_upload).build_uploader } + + context "no dry run" do + it "removes exif from the image" do + uploader.store!(fixture_file_upload('spec/fixtures/rails_sample.jpg')) + + original_upload = uploader.upload + expected_args = ["exiftool", "-all=", "-tagsFromFile", "@", *Gitlab::Sanitizers::Exif::EXCLUDE_PARAMS, "--IPTC:all", "--XMP-iptcExt:all", kind_of(String)] + + expect(sanitizer).to receive(:extra_tags).and_return(["", 0]) + expect(sanitizer).to receive(:exec_remove_exif!).once.and_call_original + expect(uploader).to receive(:store!).and_call_original + expect(Gitlab::Popen).to receive(:popen).with(expected_args) do |args| + File.write("#{args.last}_original", "foo") if args.last.start_with?(Dir.tmpdir) + + [expected_args, 0] + end + + sanitizer.clean(uploader, dry_run: false) + + expect(uploader.upload.id).not_to eq(original_upload.id) + expect(uploader.upload.path).to eq(original_upload.path) + end + + it "ignores image without exif" do + expected_args = ["exiftool", "-all", "-j", "-sort", "--IPTC:all", "--XMP-iptcExt:all", kind_of(String)] + + expect(Gitlab::Popen).to receive(:popen).with(expected_args).and_return(["[{}]", 0]) + expect(sanitizer).not_to receive(:exec_remove_exif!) + expect(uploader).not_to receive(:store!) + + sanitizer.clean(uploader, dry_run: false) + end + + it "raises an error if the exiftool fails with an error" do + expect(Gitlab::Popen).to receive(:popen).and_return(["error", 1]) + + expect { sanitizer.clean(uploader, dry_run: false) }.to raise_exception(RuntimeError, "failed to get exif tags: error") + end + end + + context "dry run" do + it "doesn't change the image" do + expect(sanitizer).to receive(:extra_tags).and_return({ 'foo' => 'bar' }) + expect(sanitizer).not_to receive(:exec_remove_exif!) + expect(uploader).not_to receive(:store!) + + sanitizer.clean(uploader, dry_run: true) + end + end + end + + describe "#extra_tags" do + it "returns a list of keys for exif file" do + tags = '[{ + "DigitalSourceType": "some source", + "ImageHeight": 654 + }]' + + expect(Gitlab::Popen).to receive(:popen).and_return([tags, 0]) + + expect(sanitizer.extra_tags('filename')).not_to be_empty + end + + it "returns an empty list for file with only whitelisted and ignored tags" do + tags = '[{ + "ImageHeight": 654, + "Megapixels": 0.641 + }]' + + expect(Gitlab::Popen).to receive(:popen).and_return([tags, 0]) + + expect(sanitizer.extra_tags('some file')).to be_empty + end + end +end -- GitLab