Content
View differences
Updated by Christophe Bliard over 1 year ago
**As** an administrator
**I want to** have the ability to generate an anonymized database dump
**so that** I can sent it in for debugging without revealing sensitive information
**Acceptance criteria**
* rake task to generate an anonymized database dump
* structure same as original but text based columns anonymized
* compressed
* anonymization is conducted on a copy of the original database so that the original database is not tampered with
* Documented
#### Implementation notes
Work in progress SQL script, save it as `cleanup.sql`:
```SQL
DELETE FROM sessions;
DELETE FROM user_passwords;
DELETE FROM two_factor_authentication_devices;
DELETE FROM tokens;
DELETE FROM enterprise_tokens;
DELETE FROM recaptcha_entries;
DELETE FROM job_statuses;
DELETE FROM good_jobs;
DELETE FROM good_job_batches;
DELETE FROM good_job_executions;
DELETE FROM good_job_processes;
DELETE FROM good_job_settings;
DELETE FROM deploy_targets;
DELETE FROM deploy_status_checks;
DELETE FROM storages;
DELETE FROM storages_file_links_journals;
DELETE FROM project_storages;
DELETE FROM last_project_folders;
DELETE FROM remote_identities;
DELETE FROM file_links;
DELETE FROM oauth_access_tokens;
DELETE FROM oauth_access_grants;
DELETE FROM oauth_applications;
DELETE FROM oauth_client_tokens;
DELETE FROM oauth_clients;
DELETE FROM oidc_user_session_links;
DELETE FROM webhooks_events;
DELETE FROM webhooks_logs;
DELETE FROM webhooks_webhooks;
DELETE FROM paper_trail_audits;
DELETE FROM settings WHERE name = 'welcome_text';
DELETE FROM settings WHERE name = 'welcome_title';
DELETE FROM settings WHERE name = 'app_title';
DELETE FROM settings WHERE name = 'mail_from';
DELETE FROM settings WHERE name = 'consent_info';
UPDATE attachments SET file_tsv = NULL, fulltext = NULL, fulltext_tsv = NULL;
DO $$
DECLARE table_name TEXT;
DECLARE column_name TEXT;
BEGIN
SET client_min_messages TO INFO;
FOR table_name, column_name IN (
SELECT DISTINCT information_schema.columns.table_name, information_schema.columns.column_name FROM information_schema.columns WHERE information_schema.columns.table_schema = 'public' AND data_type IN ('character varying', 'text')
AND information_schema.columns.table_name NOT IN
(
'ar_internal_metadata',
'audits',
'schema_migrations',
'colors',
'changes',
'delayed_jobs',
'github_check_runs',
'github_pull_requests',
'grid_widgets',
'paper_trail_audits',
'custom_values',
'customizable_values',
'custom_fields',
'roles',
'enumerations',
'queries',
'statuses',
'settings',
'role_permissions',
'enabled_modules',
'two_factor_authentication_devices',
'tokens'
)
AND information_schema.columns.column_name NOT LIKE '%type%'
AND NOT (information_schema.columns.table_name = 'grids' AND information_schema.columns.column_name = 'options')
AND NOT (information_schema.columns.table_name = 'users' AND information_schema.columns.column_name = 'language')
AND NOT (information_schema.columns.table_name = 'types' AND information_schema.columns.column_name = 'attribute_groups')
)
LOOP
RAISE INFO '%', CONCAT('UPDATE ', table_name, ' SET ', column_name, '=MD5(', column_name, ') WHERE NOT ', column_name, ' = '''';');
EXECUTE CONCAT('UPDATE ', table_name, ' SET ', column_name, '=MD5(', column_name, ') WHERE NOT ', column_name, ' = '''';');
END LOOP;
END $$;
UPDATE roles SET name = MD5(name)::varchar(30);
UPDATE enumerations SET name = MD5(name)::varchar(30);
UPDATE custom_fields SET name = MD5(name)::varchar(30);
UPDATE statuses SET name = MD5(name)::varchar(30);
UPDATE queries SET name = MD5(name)::varchar(30);
UPDATE custom_values SET value = MD5(value) WHERE custom_field_id in (SELECT id from custom_fields where field_format IN ('text', 'string'));
UPDATE customizable_journals SET value = MD5(value) WHERE custom_field_id in (SELECT id from custom_fields where field_format IN ('text', 'string'));
UPDATE grid_widgets grids SET options = '---\n:name: Custom title\n:text: Custom text\n' WHERE identifier = 'custom_text';
```
Also, some additional steps are performed in a separate wrapping script
```bash
#!/bin/bash
set -e
DUMPFILE="${1:?Pass the dump file as first argument}"
[ -f "~/openproject/dev" ] && OPENPROJECT_DEV_DIR=~/openproject/dev \
|| [ -f "/code/opf/openproject" ] && OPENPROJECT_DEV_DIR=/code/opf/openproject \
|| OPENPROJECT_DEV_DIR="${2:?Pass the openproject dev directory as second argument}" file}"
[ -d "$OPENPROJECT_DEV_DIR" ] || { echo "OpenProject dev directory $OPENPROJECT_DEV_DIR does not exist"; exit 1; }
[ -f "$OPENPROJECT_DEV_DIR/Gemfile" ] || { echo "$OPENPROJECT_DEV_DIR does not seem to be a openproject repository"; exit 1; }
dropdb --if-exists tmp
createdb tmp
psql --dbname -d tmp --command -c "DROP SCHEMA public;"
psql --dbname -d tmp --command -c "CREATE EXTENSION IF NOT EXISTS btree_gist WITH SCHEMA pg_catalog;"
psql --dbname -d tmp --command -c "CREATE EXTENSION IF NOT EXISTS pg_trgm WITH SCHEMA pg_catalog;"
psql --dbname -d tmp < $DUMPFILE
SCHEMA=$(psql --tuples-only --quiet --dbname -t -d tmp --command -c "select schema_name from information_schema.schemata WHERE schema_owner != 'postgres';" | tr -d '[:space:]')
echo "RENAMING $SCHEMA to public";
psql --dbname -d tmp --command -c "ALTER SCHEMA \"$SCHEMA\" RENAME TO \"public\";"
psql --dbname -d tmp --command -c "UPDATE settings SET value = 'localhost:3000' WHERE name = 'host_name'"
cat cleanup.sql | psql -d tmp
rubytmpfile=$(mktemp /tmp/cleanup-script.XXXXXX)
cat <<EOT >> $rubytmpfile
Grids::Widget.where(identifier: 'custom_text').update_all(options: { name: "Custom title", text: 'Custom text' } )
u = User.new login: 'admin', mail: 'admin@localhost', firstname: 'admin', lastname: 'lastname', password: 'admin', password_confirmation: 'admin', admin: true; u.save!(validate: false)
EOT
cd "$OPENPROJECT_DEV_DIR" ~/openproject/dev && DATABASE_URL=postgres:///tmp bundle exec rails runner $rubytmpfile
pg_dump -Fp tmp > tmp.dump.sql
echo "Anonymized dump saved to tmp.dump.sql"
echo "Load it like this:"
echo " createdb -O openproject openproject_debug_me"
echo " psql --username=openproject openproject_debug_me < tmp.dump.sql"
echo "Then log in with user: admin, password: admin"
tmp.dump
```
**I want to** have the ability to generate an anonymized database dump
**so that** I can sent it in for debugging without revealing sensitive information
**Acceptance criteria**
* rake task to generate an anonymized database dump
* structure same as original but text based columns anonymized
* compressed
* anonymization is conducted on a copy of the original database so that the original database is not tampered with
* Documented
#### Implementation notes
Work in progress SQL script, save it as `cleanup.sql`:
```SQL
DELETE FROM sessions;
DELETE FROM user_passwords;
DELETE FROM two_factor_authentication_devices;
DELETE FROM tokens;
DELETE FROM enterprise_tokens;
DELETE FROM recaptcha_entries;
DELETE FROM job_statuses;
DELETE FROM good_jobs;
DELETE FROM good_job_batches;
DELETE FROM good_job_executions;
DELETE FROM good_job_processes;
DELETE FROM good_job_settings;
DELETE FROM deploy_targets;
DELETE FROM deploy_status_checks;
DELETE FROM storages;
DELETE FROM storages_file_links_journals;
DELETE FROM project_storages;
DELETE FROM last_project_folders;
DELETE FROM remote_identities;
DELETE FROM file_links;
DELETE FROM oauth_access_tokens;
DELETE FROM oauth_access_grants;
DELETE FROM oauth_applications;
DELETE FROM oauth_client_tokens;
DELETE FROM oauth_clients;
DELETE FROM oidc_user_session_links;
DELETE FROM webhooks_events;
DELETE FROM webhooks_logs;
DELETE FROM webhooks_webhooks;
DELETE FROM paper_trail_audits;
DELETE FROM settings WHERE name = 'welcome_text';
DELETE FROM settings WHERE name = 'welcome_title';
DELETE FROM settings WHERE name = 'app_title';
DELETE FROM settings WHERE name = 'mail_from';
DELETE FROM settings WHERE name = 'consent_info';
UPDATE attachments SET file_tsv = NULL, fulltext = NULL, fulltext_tsv = NULL;
DO $$
DECLARE table_name TEXT;
DECLARE column_name TEXT;
BEGIN
SET client_min_messages TO INFO;
FOR table_name, column_name IN (
SELECT DISTINCT information_schema.columns.table_name, information_schema.columns.column_name FROM information_schema.columns WHERE information_schema.columns.table_schema = 'public' AND data_type IN ('character varying', 'text')
AND information_schema.columns.table_name NOT IN
(
'ar_internal_metadata',
'audits',
'schema_migrations',
'colors',
'changes',
'delayed_jobs',
'github_check_runs',
'github_pull_requests',
'grid_widgets',
'paper_trail_audits',
'custom_values',
'customizable_values',
'custom_fields',
'roles',
'enumerations',
'queries',
'statuses',
'settings',
'role_permissions',
'enabled_modules',
'two_factor_authentication_devices',
'tokens'
)
AND information_schema.columns.column_name NOT LIKE '%type%'
AND NOT (information_schema.columns.table_name = 'grids' AND information_schema.columns.column_name = 'options')
AND NOT (information_schema.columns.table_name = 'users' AND information_schema.columns.column_name = 'language')
AND NOT (information_schema.columns.table_name = 'types' AND information_schema.columns.column_name = 'attribute_groups')
)
LOOP
RAISE INFO '%', CONCAT('UPDATE ', table_name, ' SET ', column_name, '=MD5(', column_name, ') WHERE NOT ', column_name, ' = '''';');
EXECUTE CONCAT('UPDATE ', table_name, ' SET ', column_name, '=MD5(', column_name, ') WHERE NOT ', column_name, ' = '''';');
END LOOP;
END $$;
UPDATE roles SET name = MD5(name)::varchar(30);
UPDATE enumerations SET name = MD5(name)::varchar(30);
UPDATE custom_fields SET name = MD5(name)::varchar(30);
UPDATE statuses SET name = MD5(name)::varchar(30);
UPDATE queries SET name = MD5(name)::varchar(30);
UPDATE custom_values SET value = MD5(value) WHERE custom_field_id in (SELECT id from custom_fields where field_format IN ('text', 'string'));
UPDATE customizable_journals SET value = MD5(value) WHERE custom_field_id in (SELECT id from custom_fields where field_format IN ('text', 'string'));
UPDATE grid_widgets
```
Also, some additional steps are performed in a separate wrapping script
```bash
#!/bin/bash
set -e
DUMPFILE="${1:?Pass the dump file as first argument}"
[ -f "~/openproject/dev" ] && OPENPROJECT_DEV_DIR=~/openproject/dev \
|| [ -f "/code/opf/openproject" ] && OPENPROJECT_DEV_DIR=/code/opf/openproject \
|| OPENPROJECT_DEV_DIR="${2:?Pass the openproject dev directory as second argument}"
[ -d "$OPENPROJECT_DEV_DIR" ] || { echo "OpenProject dev directory $OPENPROJECT_DEV_DIR does not exist"; exit 1; }
[ -f "$OPENPROJECT_DEV_DIR/Gemfile" ] || { echo "$OPENPROJECT_DEV_DIR does not seem to be a openproject repository"; exit 1; }
dropdb --if-exists tmp
createdb tmp
psql --dbname
psql --dbname
psql --dbname
psql --dbname
SCHEMA=$(psql --tuples-only --quiet --dbname
echo "RENAMING $SCHEMA to public";
psql --dbname
psql --dbname
cat cleanup.sql | psql -d tmp
rubytmpfile=$(mktemp /tmp/cleanup-script.XXXXXX)
cat <<EOT >> $rubytmpfile
Grids::Widget.where(identifier: 'custom_text').update_all(options: { name: "Custom title", text: 'Custom text' } )
u = User.new login: 'admin', mail: 'admin@localhost', firstname: 'admin', lastname: 'lastname', password: 'admin', password_confirmation: 'admin', admin: true; u.save!(validate: false)
EOT
cd "$OPENPROJECT_DEV_DIR"
pg_dump -Fp tmp > tmp.dump.sql
echo "Anonymized dump saved to tmp.dump.sql"
echo "Load it like this:"
echo " createdb -O openproject openproject_debug_me"
echo " psql --username=openproject openproject_debug_me < tmp.dump.sql"
echo "Then log in with user: admin, password: admin"