gotosocial/internal/cleaner/media.go
kim 31294f7c78
[bugfix] media.Processor{}.GetFile() returning 404s on first call, correctly loading on 2nd (#3129)
* refactor file handling a tiny bit

* whoops

* make processing media / emoji defers a bit clearer, to show that it's the "on finished processing" path

* some wording

* add some debug logging

* add mutex locks for processing remote media

* try removing freshness check

* fix derefMedia not being allocated

* fix log format string

* handle case of empty file paths (i.e. not stored)

* remove media / emoji once finished processing from dereferencer maps

* whoops, fix the cached / force checks

* move url parsing outside of 'process___Safely()' funcs to prevalidate url

* use emoji.ShortcodeDomain()

* update RefreshEmoji() to also match RefreshMedia() changes

---------

Co-authored-by: tobi <tobi.smethurst@protonmail.com>
2024-07-22 18:45:48 +01:00

592 lines
17 KiB
Go

// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package cleaner
import (
"context"
"errors"
"time"
"github.com/superseriousbusiness/gotosocial/internal/db"
"github.com/superseriousbusiness/gotosocial/internal/gtscontext"
"github.com/superseriousbusiness/gotosocial/internal/gtserror"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
"github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/superseriousbusiness/gotosocial/internal/media"
"github.com/superseriousbusiness/gotosocial/internal/paging"
"github.com/superseriousbusiness/gotosocial/internal/regexes"
"github.com/superseriousbusiness/gotosocial/internal/uris"
)
// Media groups together the media cleanup / admin
// utilities. It embeds *Cleaner, so everything the
// cleaner exposes is reachable directly through it.
type Media struct {
	*Cleaner
}
// All will execute all cleaner.Media utilities synchronously, including output logging.
// In order it: uncaches remote media older than maxRemoteDays days, prunes orphaned
// files, prunes unused attachments, fixes cache states, and finally asks the storage
// driver to clean itself. A failure of that final clean step is logged rather than
// silently discarded; it never aborts the call.
// Context will be checked for `gtscontext.DryRun()` in order to actually perform the action.
//
// NOTE(review): the storage Clean() call below is made regardless of
// dry-run state — confirm that this is the intended behaviour.
func (m *Media) All(ctx context.Context, maxRemoteDays int) {
	// Cutoff time: now minus maxRemoteDays whole days.
	t := time.Now().Add(-24 * time.Hour * time.Duration(maxRemoteDays))

	m.LogUncacheRemote(ctx, t)
	m.LogPruneOrphaned(ctx)
	m.LogPruneUnused(ctx)
	m.LogFixCacheStates(ctx)

	// Cleaning storage is best-effort, but an error
	// here should still be visible to the admin.
	if err := m.state.Storage.Storage.Clean(ctx); err != nil {
		log.Error(ctx, gtserror.Newf("error cleaning storage: %w", err))
	}
}
// LogUncacheRemote wraps Media.UncacheRemote(...) with logging: it logs the
// cutoff time before starting, then either the returned error or the number
// of attachments that were uncached.
func (m *Media) LogUncacheRemote(ctx context.Context, olderThan time.Time) {
	cutoff := olderThan.Format(time.Stamp)
	log.Infof(ctx, "start older than: %s", cutoff)

	count, err := m.UncacheRemote(ctx, olderThan)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	log.Infof(ctx, "uncached: %d", count)
}
// LogPruneOrphaned wraps Media.PruneOrphaned(...) with logging: a "start"
// line up front, then either the returned error or the pruned file count.
func (m *Media) LogPruneOrphaned(ctx context.Context) {
	log.Info(ctx, "start")

	count, err := m.PruneOrphaned(ctx)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	log.Infof(ctx, "pruned: %d", count)
}
// LogPruneUnused wraps Media.PruneUnused(...) with logging: a "start"
// line up front, then either the returned error or the pruned count.
func (m *Media) LogPruneUnused(ctx context.Context) {
	log.Info(ctx, "start")

	count, err := m.PruneUnused(ctx)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	log.Infof(ctx, "pruned: %d", count)
}
// LogFixCacheStates wraps Media.FixCacheStates(...) with logging: a "start"
// line up front, then either the returned error or the number of fixes made.
func (m *Media) LogFixCacheStates(ctx context.Context) {
	log.Info(ctx, "start")

	count, err := m.FixCacheStates(ctx)
	if err != nil {
		log.Error(ctx, err)
		return
	}

	log.Infof(ctx, "fixed: %d", count)
}
// PruneOrphaned walks every key in storage, collects those that isOrphaned()
// reports as having no matching database entry, and hands the collected keys
// to removeFiles(). Keys that do not match regexes.FilePath are logged with
// a warning and otherwise left alone. Returns whatever removeFiles() returns,
// or (0, error) if the storage walk itself failed.
//
// NOTE(review): dry-run handling for this path is expected to
// live inside removeFiles(), which is not visible here — confirm.
func (m *Media) PruneOrphaned(ctx context.Context) (int, error) {
	var orphans []string

	// visit is called once per storage key. Stored media is expected
	// to follow: {$account}/{$type}/{$size}/{$id}.{$ext}
	visit := func(key string) error {
		if !regexes.FilePath.MatchString(key) {
			// Not in our fileserver path format.
			log.Warnf(ctx, "unexpected storage item: %s", key)
			return nil
		}

		isOrphan, err := m.isOrphaned(ctx, key)
		if err != nil {
			return gtserror.Newf("error checking orphaned status: %w", err)
		}

		if isOrphan {
			// Remember this key for removal.
			orphans = append(orphans, key)
		}

		return nil
	}

	if err := m.state.Storage.WalkKeys(ctx, visit); err != nil {
		return 0, gtserror.Newf("error walking storage: %w", err)
	}

	// Remove everything we found to be orphaned.
	return m.removeFiles(ctx, orphans...)
}
// PruneUnused pages through all media attachments in the database (selectLimit
// per page) and passes each one to pruneUnused(), which deletes attachments that
// are no longer referenced by a status or in use by a non-suspended account.
// Returns the number of attachments selected for deletion. On error, the count
// reached so far is returned alongside it.
// Context will be checked for `gtscontext.DryRun()` in order to actually perform the action.
func (m *Media) PruneUnused(ctx context.Context) (int, error) {
	pruned := 0
	page := paging.Page{Limit: selectLimit}

	for {
		// Select the next page of attachments below the current max ID.
		batch, err := m.state.DB.GetAttachments(ctx, &page)
		if err != nil && !errors.Is(err, db.ErrNoEntries) {
			return pruned, gtserror.Newf("error getting attachments: %w", err)
		}

		if len(batch) == 0 {
			// Nothing left to look at.
			return pruned, nil
		}

		lastID := batch[len(batch)-1].ID
		if lastID == page.Max.Value {
			// Same page came back again, so
			// we have reached the end.
			return pruned, nil
		}

		// Continue from the last ID seen.
		page.Max = paging.MaxID(lastID)

		for _, attachment := range batch {
			// Check / prune this attachment if unused.
			removed, err := m.pruneUnused(ctx, attachment)
			if err != nil {
				return pruned, err
			}

			if removed {
				pruned++
			}
		}
	}
}
// UncacheRemote pages through cached attachments older than the given time
// (selectLimit per batch) and passes each to uncacheRemote(), which decides
// whether it should really be uncached. Returns the number of attachments
// uncached. On error, the count reached so far is returned alongside it.
// Context will be checked for `gtscontext.DryRun()` in order to actually perform the action.
func (m *Media) UncacheRemote(ctx context.Context, olderThan time.Time) (int, error) {
	// Shift the threshold back by a minute so that
	// the search is effectively olderThan-inclusive.
	threshold := olderThan.Add(-time.Minute)

	// cursor moves further back with every batch, while
	// threshold stays fixed for the "recently fetched" checks.
	cursor := threshold
	uncachedCount := 0

	for {
		// Select the next batch of cached attachments older than the cursor.
		batch, err := m.state.DB.GetCachedAttachmentsOlderThan(ctx, cursor, selectLimit)
		if err != nil && !errors.Is(err, db.ErrNoEntries) {
			return uncachedCount, gtserror.Newf("error getting remote attachments: %w", err)
		}

		if len(batch) == 0 {
			// Nothing left to look at.
			return uncachedCount, nil
		}

		oldest := batch[len(batch)-1].CreatedAt
		if cursor.Equal(oldest) {
			// Same batch came back again, so
			// we have reached the end.
			return uncachedCount, nil
		}

		// Continue from the last created-at seen.
		cursor = oldest

		for _, attachment := range batch {
			// Check / uncache this remote attachment.
			done, err := m.uncacheRemote(ctx, threshold, attachment)
			if err != nil {
				return uncachedCount, err
			}

			if done {
				uncachedCount++
			}
		}
	}
}
// FixCacheStates pages through remote media attachments (selectLimit per page)
// and passes each one to fixCacheState(), which reconciles the attachment's
// "cached" flag with what is actually present in storage. Returns the number
// of attachments that needed fixing. On error, the count reached so far is
// returned alongside it.
// Context will be checked for `gtscontext.DryRun()` in order to actually perform the action.
func (m *Media) FixCacheStates(ctx context.Context) (int, error) {
	fixedCount := 0
	page := paging.Page{Limit: selectLimit}

	for {
		// Select the next page of remote attachments below the current max ID.
		batch, err := m.state.DB.GetRemoteAttachments(ctx, &page)
		if err != nil && !errors.Is(err, db.ErrNoEntries) {
			return fixedCount, gtserror.Newf("error getting remote attachments: %w", err)
		}

		if len(batch) == 0 {
			// Nothing left to look at.
			return fixedCount, nil
		}

		lastID := batch[len(batch)-1].ID
		if lastID == page.Max.Value {
			// Same page came back again, so
			// we have reached the end.
			return fixedCount, nil
		}

		// Continue from the last ID seen.
		page.Max = paging.MaxID(lastID)

		for _, attachment := range batch {
			// Check / fix this attachment's cache state.
			changed, err := m.fixCacheState(ctx, attachment)
			if err != nil {
				return fixedCount, err
			}

			if changed {
				fixedCount++
			}
		}
	}
}
// isOrphaned reports whether the given storage path has no matching database
// entry. Paths that don't yield exactly 6 regex submatches, or whose media
// type is neither attachment nor emoji, are never reported as orphaned.
func (m *Media) isOrphaned(ctx context.Context, path string) (bool, error) {
	parts := regexes.FilePath.FindStringSubmatch(path)
	if len(parts) != 6 {
		// Not a shape we recognise, so it was
		// not created by gts; leave it alone.
		return false, nil
	}

	// Submatch layout:
	//   0 -> whole match
	//   1 -> account ID
	//   2 -> media type
	//   3 -> media sub-type (e.g. small, static)
	//   4 -> media ID
	//   5 -> file extension
	var (
		accountID = parts[1]
		kind      = parts[2]
		id        = parts[4]
	)

	// Log entry scoped to this media ID.
	entry := log.WithContext(ctx).
		WithField("media", id)

	switch media.Type(kind) {
	case media.TypeEmoji:
		// The media ID part of an emoji's storage key can change
		// when the emoji is refreshed, so instead of looking up by
		// ID we build the static URL and look the emoji up by that.
		staticURL := uris.URIForAttachment(
			accountID, // instance account ID
			string(media.TypeEmoji),
			string(media.SizeStatic),
			id,
			"png",
		)

		emoji, err := m.state.DB.GetEmojiByStaticURL(
			gtscontext.SetBarebones(ctx),
			staticURL,
		)
		if err != nil && !errors.Is(err, db.ErrNoEntries) {
			return false, gtserror.Newf("error fetching emoji by url %s: %w", staticURL, err)
		}

		if emoji == nil {
			entry.Debug("missing db entry for emoji")
			return true, nil
		}

	case media.TypeAttachment:
		// Attachments are looked up directly by ID.
		attachment, err := m.state.DB.GetAttachmentByID(
			gtscontext.SetBarebones(ctx),
			id,
		)
		if err != nil && !errors.Is(err, db.ErrNoEntries) {
			return false, gtserror.Newf("error fetching media by id %s: %w", id, err)
		}

		if attachment == nil {
			entry.Debug("missing db entry for media")
			return true, nil
		}
	}

	return false, nil
}
// pruneUnused checks a single media attachment and deletes it (via m.delete)
// when nothing references it any more.
//
// The attachment is kept when either:
//   - it is the current header / avatar of its owning account,
//     and that account is not suspended; or
//   - its related status still lists it in AttachmentIDs.
//
// It is deleted when its owning account or related status is set but can
// no longer be found, or when neither of the conditions above applies.
//
// Returns true when the attachment was selected for deletion, together
// with any error returned by the deletion itself.
func (m *Media) pruneUnused(ctx context.Context, media *gtsmodel.MediaAttachment) (bool, error) {
	// Start a log entry for media.
	l := log.WithContext(ctx).
		WithField("media", media.ID)

	// Check whether we have the account that owns the media.
	account, missing, err := m.getOwningAccount(ctx, media)
	if err != nil {
		return false, err
	} else if missing {
		l.Debug("deleting due to missing account")
		return true, m.delete(ctx, media)
	}

	if account != nil {
		// Related account exists for this media, check whether it is being used.
		headerInUse := (*media.Header && media.ID == account.HeaderMediaAttachmentID)
		avatarInUse := (*media.Avatar && media.ID == account.AvatarMediaAttachmentID)

		// Media of a suspended account is not treated
		// as in use, so it falls through to the checks below.
		if (headerInUse || avatarInUse) && account.SuspendedAt.IsZero() {
			l.Debug("skipping as account media in use")
			return false, nil
		}
	}

	// Check whether we have the required status for media.
	status, missing, err := m.getRelatedStatus(ctx, media)
	if err != nil {
		return false, err
	} else if missing {
		l.Debug("deleting due to missing status")
		return true, m.delete(ctx, media)
	}

	if status != nil {
		// Check whether still attached to status.
		for _, id := range status.AttachmentIDs {
			if id == media.ID {
				l.Debug("skipping as attached to status")
				return false, nil
			}
		}
	}

	// Media totally unused, delete it.
	l.Debug("deleting unused media")
	return true, m.delete(ctx, media)
}
// fixCacheState reconciles one attachment's "cached" flag with the presence of
// its thumbnail and file in storage (as reported by haveFiles):
//
//   - flagged cached but files missing -> uncache the attachment
//   - flagged uncached but files exist -> remove the files
//
// Attachments whose owning account or related status is set but cannot be
// found are skipped. Returns true when a fix was applied (or attempted),
// together with any error from applying it.
func (m *Media) fixCacheState(ctx context.Context, media *gtsmodel.MediaAttachment) (bool, error) {
	// Log entry scoped to this media ID.
	entry := log.WithContext(ctx).
		WithField("media", media.ID)

	// The owning account must still be around.
	_, accountGone, err := m.getOwningAccount(ctx, media)
	if err != nil {
		return false, err
	}
	if accountGone {
		entry.Debug("skipping due to missing account")
		return false, nil
	}

	// Likewise for the related status.
	_, statusGone, err := m.getRelatedStatus(ctx, media)
	if err != nil {
		return false, err
	}
	if statusGone {
		entry.Debug("skipping due to missing status")
		return false, nil
	}

	// See what storage actually holds.
	present, err := m.haveFiles(ctx,
		media.Thumbnail.Path,
		media.File.Path,
	)
	if err != nil {
		return false, err
	}

	flagged := *media.Cached
	if flagged == present {
		// Flag and storage agree; nothing to fix.
		return false, nil
	}

	if flagged {
		// Expected files are gone; mark as uncached.
		entry.Debug("cached=true exists=false => uncaching")
		return true, m.uncache(ctx, media)
	}

	// Files exist that we don't expect; remove them.
	entry.Debug("cached=false exists=true => deleting")
	_, err = m.removeFiles(ctx,
		media.Thumbnail.Path,
		media.File.Path,
	)
	return true, err
}
// uncacheRemote decides whether a single cached attachment should be uncached,
// and uncaches it (via m.uncache) if so. The attachment is left alone when:
//
//   - it is already uncached;
//   - it is an avatar / header whose owning account is missing,
//     or was fetched after the given time;
//   - it is status media whose status is missing, was fetched
//     after the given time, or is bookmarked.
//
// Returns true when the attachment was selected for uncaching, together
// with any error returned by the uncaching itself.
func (m *Media) uncacheRemote(ctx context.Context, after time.Time, media *gtsmodel.MediaAttachment) (bool, error) {
	if cached := *media.Cached; !cached {
		// Nothing to do, already uncached.
		return false, nil
	}

	// Log entry scoped to this media ID.
	entry := log.WithContext(ctx).
		WithField("media", media.ID)

	if *media.Avatar || *media.Header {
		// Account media: keep it cached if
		// the account was seen recently.
		owner, gone, err := m.getOwningAccount(ctx, media)
		switch {
		case err != nil:
			return false, err

		case gone:
			// Left for PruneUnused to deal with.
			entry.Debug("skipping due to missing account")
			return false, nil

		case owner != nil && owner.FetchedAt.After(after):
			entry.Debug("skipping due to recently fetched account")
			return false, nil
		}
	} else {
		// Status media: keep it cached if the status
		// was seen recently, or is bookmarked.
		status, gone, err := m.getRelatedStatus(ctx, media)
		switch {
		case err != nil:
			return false, err

		case gone:
			// Left for PruneUnused to deal with.
			entry.Debug("skipping due to missing status")
			return false, nil

		case status == nil:
			// No related status set; no further checks apply.

		case status.FetchedAt.After(after):
			entry.Debug("skipping due to recently fetched status")
			return false, nil

		default:
			// Don't uncache media of bookmarked statuses.
			bookmarked, err := m.state.DB.IsStatusBookmarked(ctx, status.ID)
			if err != nil {
				return false, err
			}
			if bookmarked {
				entry.Debug("skipping due to bookmarked status")
				return false, nil
			}
		}
	}

	// Old enough and not protected by any check above.
	entry.Debug("uncaching old remote media")
	return true, m.uncache(ctx, media)
}
// getOwningAccount loads (barebones) the account referenced by media.AccountID.
// Return values:
//
//   - (nil, false, nil)     -> media has no account ID set
//   - (nil, true, nil)      -> account ID set, but no account was returned
//   - (account, false, nil) -> account found
//   - (nil, false, err)     -> database error other than db.ErrNoEntries
func (m *Media) getOwningAccount(ctx context.Context, media *gtsmodel.MediaAttachment) (*gtsmodel.Account, bool, error) {
	accountID := media.AccountID
	if accountID == "" {
		// Media isn't tied to an account.
		return nil, false, nil
	}

	// Fetch the owning account.
	account, err := m.state.DB.GetAccountByID(
		gtscontext.SetBarebones(ctx),
		accountID,
	)

	switch {
	case err != nil && !errors.Is(err, db.ErrNoEntries):
		return nil, false, gtserror.Newf("error fetching account by id %s: %w", accountID, err)

	case account == nil:
		// Referenced account is missing.
		return nil, true, nil

	default:
		return account, false, nil
	}
}
// getRelatedStatus loads (barebones) the status referenced by media.StatusID.
// Return values:
//
//   - (nil, false, nil)    -> media has no status ID set
//   - (nil, true, nil)     -> status ID set, but no status was returned
//   - (status, false, nil) -> status found
//   - (nil, false, err)    -> database error other than db.ErrNoEntries
func (m *Media) getRelatedStatus(ctx context.Context, media *gtsmodel.MediaAttachment) (*gtsmodel.Status, bool, error) {
	statusID := media.StatusID
	if statusID == "" {
		// Media isn't tied to a status.
		return nil, false, nil
	}

	// Fetch the related status.
	status, err := m.state.DB.GetStatusByID(
		gtscontext.SetBarebones(ctx),
		statusID,
	)

	switch {
	case err != nil && !errors.Is(err, db.ErrNoEntries):
		return nil, false, gtserror.Newf("error fetching status by id %s: %w", statusID, err)

	case status == nil:
		// Referenced status is missing.
		return nil, true, nil

	default:
		return status, false, nil
	}
}
// uncache removes the attachment's file and thumbnail via removeFiles(), then
// sets its Cached flag to false and persists the "cached" column to the database.
// Does nothing (and returns nil) when the context is flagged as a dry run.
func (m *Media) uncache(ctx context.Context, media *gtsmodel.MediaAttachment) error {
	if gtscontext.DryRun(ctx) {
		// Dry run: leave everything untouched.
		return nil
	}

	// Drop both the main file and its thumbnail.
	if _, err := m.removeFiles(ctx,
		media.File.Path,
		media.Thumbnail.Path,
	); err != nil {
		return gtserror.Newf("error removing media files: %w", err)
	}

	// Record in the database that the files are no longer cached.
	log.Debugf(ctx, "marking media attachment as uncached: %s", media.ID)
	media.Cached = new(bool) // new(bool) points at false
	if err := m.state.DB.UpdateAttachment(ctx, media, "cached"); err != nil {
		return gtserror.Newf("error updating media: %w", err)
	}

	return nil
}
// delete removes the attachment's file and thumbnail via removeFiles(),
// then deletes the attachment's database entry by ID.
// Does nothing (and returns nil) when the context is flagged as a dry run.
func (m *Media) delete(ctx context.Context, media *gtsmodel.MediaAttachment) error {
	if gtscontext.DryRun(ctx) {
		// Dry run: leave everything untouched.
		return nil
	}

	// Drop both the main file and its thumbnail.
	if _, err := m.removeFiles(ctx,
		media.File.Path,
		media.Thumbnail.Path,
	); err != nil {
		return gtserror.Newf("error removing media files: %w", err)
	}

	// Then drop the database entry itself.
	log.Debugf(ctx, "deleting media attachment: %s", media.ID)
	if err := m.state.DB.DeleteAttachment(ctx, media.ID); err != nil {
		return gtserror.Newf("error deleting media: %w", err)
	}

	return nil
}