2023-03-12 18:00:57 +03:00
// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
2021-08-10 14:32:39 +03:00
package dereferencing
import (
2021-08-25 16:34:33 +03:00
"context"
2023-06-24 10:32:10 +03:00
"net/http"
2021-08-10 14:32:39 +03:00
"net/url"
2022-07-19 11:47:55 +03:00
"codeberg.org/gruf/go-kv"
2023-06-03 12:35:15 +03:00
"github.com/superseriousbusiness/activity/pub"
2021-08-10 14:32:39 +03:00
"github.com/superseriousbusiness/gotosocial/internal/ap"
2021-12-07 15:31:39 +03:00
"github.com/superseriousbusiness/gotosocial/internal/config"
2023-05-28 15:08:35 +03:00
"github.com/superseriousbusiness/gotosocial/internal/gtserror"
2022-09-25 14:09:41 +03:00
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
2022-07-19 11:47:55 +03:00
"github.com/superseriousbusiness/gotosocial/internal/log"
2021-08-10 14:32:39 +03:00
)
2022-09-25 14:09:41 +03:00
// maxIter defines how many iterations of descendants or
// ancestors we are willing to follow before returning error.
const maxIter = 512
2022-09-25 14:09:41 +03:00
2023-11-04 23:21:20 +03:00
// dereferenceThread handles dereferencing status thread after
// fetch. Passing off appropriate parts to be enqueued for async
// processing, or handling some parts synchronously when required.
func ( d * Dereferencer ) dereferenceThread (
ctx context . Context ,
requestUser string ,
uri * url . URL ,
status * gtsmodel . Status ,
statusable ap . Statusable ,
isNew bool ,
) {
if isNew {
// This is a new status that we need the ancestors of in
// order to determine visibility. Perform the initial part
// of thread dereferencing, i.e. parents, synchronously.
err := d . DereferenceStatusAncestors ( ctx , requestUser , status )
if err != nil {
log . Error ( ctx , err )
}
2021-08-10 14:32:39 +03:00
2023-11-04 23:21:20 +03:00
// Enqueue dereferencing remaining status thread, (children), asychronously .
2024-04-26 15:50:46 +03:00
d . state . Workers . Dereference . Queue . Push ( func ( ctx context . Context ) {
2023-11-04 23:21:20 +03:00
if err := d . DereferenceStatusDescendants ( ctx , requestUser , uri , statusable ) ; err != nil {
log . Error ( ctx , err )
}
} )
} else {
// This is an existing status, dereference the WHOLE thread asynchronously.
2024-04-26 15:50:46 +03:00
d . state . Workers . Dereference . Queue . Push ( func ( ctx context . Context ) {
2023-11-04 23:21:20 +03:00
if err := d . DereferenceStatusAncestors ( ctx , requestUser , status ) ; err != nil {
log . Error ( ctx , err )
}
if err := d . DereferenceStatusDescendants ( ctx , requestUser , uri , statusable ) ; err != nil {
log . Error ( ctx , err )
}
} )
2021-08-10 14:32:39 +03:00
}
}
2023-10-23 12:58:13 +03:00
// DereferenceStatusAncestors iterates upwards from the given status, using InReplyToURI, to ensure that as many parent statuses as possible are dereferenced.
func ( d * Dereferencer ) DereferenceStatusAncestors ( ctx context . Context , username string , status * gtsmodel . Status ) error {
2023-09-05 13:22:02 +03:00
// Start log entry with fields
l := log . WithContext ( ctx ) .
WithFields ( kv . Fields {
{ "username" , username } ,
{ "original" , status . URI } ,
} ... )
// Keep track of already dereferenced statuses
// for this ancestor thread to prevent recursion.
derefdStatuses := make ( map [ string ] struct { } , 10 )
2023-06-24 10:32:10 +03:00
// Mark given status as the one
// we're currently working on.
2023-09-05 13:22:02 +03:00
current := status
2021-08-10 14:32:39 +03:00
2022-09-25 14:09:41 +03:00
for i := 0 ; i < maxIter ; i ++ {
2023-06-24 10:32:10 +03:00
if current . InReplyToURI == "" {
// Status has no parent, we've
// reached the top of the chain.
2021-08-10 14:32:39 +03:00
return nil
}
2022-06-11 17:25:41 +03:00
2024-02-23 18:24:40 +03:00
// Apparent current parent URI to log fields.
2024-01-31 16:29:47 +03:00
l = l . WithField ( "parent" , current . InReplyToURI )
l . Trace ( "following status ancestor" )
2023-09-05 13:22:02 +03:00
// Check whether this parent has already been deref'd.
if _ , ok := derefdStatuses [ current . InReplyToURI ] ; ok {
2024-01-31 16:29:47 +03:00
l . Warn ( "self referencing status ancestor" )
2023-09-05 13:22:02 +03:00
return nil
}
2024-01-31 16:29:47 +03:00
// Add this status's parent URI to map of deref'd.
derefdStatuses [ current . InReplyToURI ] = struct { } { }
2022-09-25 14:09:41 +03:00
2024-02-23 18:24:40 +03:00
// Parse status parent URI for later use.
uri , err := url . Parse ( current . InReplyToURI )
if err != nil {
l . Warnf ( "invalid uri: %v" , err )
return nil
}
2024-01-31 16:29:47 +03:00
// Fetch parent status by current's reply URI, this handles
// case of existing (updating if necessary) or a new status.
parent , _ , _ , err := d . getStatusByURI ( ctx , username , uri )
2022-09-25 14:09:41 +03:00
2024-01-31 16:29:47 +03:00
// Check for a returned HTTP code via error.
switch code := gtserror . StatusCode ( err ) ; {
2022-09-25 14:09:41 +03:00
2024-02-18 12:49:40 +03:00
// 404 may indicate deletion, but can also
// indicate that we don't have permission to
// view the status (it's followers-only and
// we don't follow, for example).
case code == http . StatusNotFound :
2024-02-23 18:24:40 +03:00
2024-02-18 12:49:40 +03:00
// If this reply is followers-only or stricter,
// we can safely assume the status it replies
// to is also followers only or stricter.
//
// In this case we should leave the inReplyTo
// URI in place for visibility filtering,
// and just return since we can go no further.
if status . Visibility == gtsmodel . VisibilityFollowersOnly ||
status . Visibility == gtsmodel . VisibilityMutualsOnly ||
status . Visibility == gtsmodel . VisibilityDirect {
return nil
}
// If the reply is public or unlisted then
// likely the replied-to status is/was public
// or unlisted and has indeed been deleted,
// fall through to the Gone case to clean up.
fallthrough
// Gone (410) definitely indicates deletion.
// Update the status to remove references to
// the now-gone parent.
case code == http . StatusGone :
2024-01-31 16:29:47 +03:00
l . Trace ( "status orphaned" )
current . InReplyTo = nil
current . InReplyToAccount = nil
2024-02-23 18:24:40 +03:00
return d . updateStatusParent ( ctx ,
2024-01-31 16:29:47 +03:00
current ,
2024-02-23 18:24:40 +03:00
"" , // status ID
"" , // status URI
"" , // account ID
)
2023-06-24 10:32:10 +03:00
2024-01-31 16:29:47 +03:00
// An error was returned for a status during
// an attempted NEW dereference, return here.
2024-02-23 18:24:40 +03:00
//
// NOTE: this will catch all cases of a nil
// parent, all cases below can safely assume
// a non-nil parent in their code logic.
case err != nil && parent == nil :
2024-01-31 16:29:47 +03:00
return gtserror . Newf ( "error dereferencing new %s: %w" , current . InReplyToURI , err )
// An error was returned for an existing parent,
// we simply treat this as a temporary situation.
case err != nil :
l . Errorf ( "error getting parent: %v" , err )
2024-02-23 18:24:40 +03:00
}
// Start a new switch case
// as the following scenarios
// are possible with / without
// any returned error.
switch {
// The current status is using an indirect URL
// in order to reference the parent. This is just
// weird and broken... Leave the URI in place but
// don't link the statuses via database IDs as it
// could cause all sorts of unexpected situations.
case current . InReplyToURI != parent . URI :
l . Errorf ( "indirect in_reply_to_uri => %s" , parent . URI )
2024-01-31 16:29:47 +03:00
// The ID has changed for currently stored parent ID
// (which may be empty, if new!) and fetched version.
//
// Update the current's inReplyTo fields to parent.
case current . InReplyToID != parent . ID :
l . Tracef ( "parent changed %s => %s" , current . InReplyToID , parent . ID )
current . InReplyToAccount = parent . Account
2024-02-23 18:24:40 +03:00
if err := d . updateStatusParent ( ctx ,
2024-01-31 16:29:47 +03:00
current ,
2024-02-23 18:24:40 +03:00
parent . ID ,
parent . URI ,
parent . AccountID ,
2023-06-24 10:32:10 +03:00
) ; err != nil {
2024-02-23 18:24:40 +03:00
return err
2022-09-25 14:09:41 +03:00
}
}
2024-01-31 16:29:47 +03:00
// Set next parent to use.
current . InReplyTo = parent
current = current . InReplyTo
2021-08-10 14:32:39 +03:00
}
2023-06-24 10:32:10 +03:00
return gtserror . Newf ( "reached %d ancestor iterations for %q" , maxIter , status . URI )
2021-08-10 14:32:39 +03:00
}
2023-10-23 12:58:13 +03:00
// DereferenceStatusDescendents iterates downwards from the given status, using its replies, to ensure that as many children statuses as possible are dereferenced.
func ( d * Dereferencer ) DereferenceStatusDescendants ( ctx context . Context , username string , statusIRI * url . URL , parent ap . Statusable ) error {
2023-09-05 13:22:02 +03:00
statusIRIStr := statusIRI . String ( )
2022-07-19 11:47:55 +03:00
2022-09-25 14:09:41 +03:00
// Start log entry with fields
2023-02-17 14:02:29 +03:00
l := log . WithContext ( ctx ) .
WithFields ( kv . Fields {
{ "username" , username } ,
2023-09-05 13:22:02 +03:00
{ "status" , statusIRIStr } ,
2023-02-17 14:02:29 +03:00
} ... )
2021-08-10 14:32:39 +03:00
2022-09-25 14:09:41 +03:00
// Log function start
l . Trace ( "beginning" )
2023-09-05 13:22:02 +03:00
// OUR instance hostname.
localhost := config . GetHost ( )
// Keep track of already dereferenced collection
// pages for this thread to prevent recursion.
derefdPages := make ( map [ string ] struct { } , 10 )
// frame represents a single stack frame when
// iteratively derefencing status descendants.
2022-09-25 14:09:41 +03:00
type frame struct {
2023-09-05 13:22:02 +03:00
// page is the current activity streams
// collection page we are on (as we often
// push a frame to stack mid-paging).
2023-09-23 20:28:12 +03:00
page ap . CollectionPageIterator
2023-09-05 13:22:02 +03:00
// pageURI is the URI string of
// the frame's collection page
// (is useful for logging).
pageURI string
2021-08-10 14:32:39 +03:00
}
2022-09-25 14:09:41 +03:00
var (
2023-09-05 13:22:02 +03:00
// current stack frame
2022-09-25 14:09:41 +03:00
current * frame
// stack is a list of "shelved" descendand iterator
// frames. this is pushed to when a child status frame
// is found that we need to further iterate down, and
// popped from into 'current' when that child's tree
// of further descendants is exhausted.
stack = [ ] * frame {
2023-09-05 13:22:02 +03:00
func ( ) * frame {
// Start input frame is built from the first input.
2023-09-23 20:28:12 +03:00
page , pageURI := getAttachedStatusCollectionPage ( parent )
2023-09-05 13:22:02 +03:00
if page == nil {
return nil
}
return & frame { page : page , pageURI : pageURI }
} ( ) ,
2022-09-25 14:09:41 +03:00
}
2021-08-10 14:32:39 +03:00
2022-09-25 14:09:41 +03:00
// popStack will remove and return the top frame
// from the stack, or nil if currently empty.
popStack = func ( ) * frame {
if len ( stack ) == 0 {
return nil
}
2021-08-10 14:32:39 +03:00
2022-09-25 14:09:41 +03:00
// Get frame index
idx := len ( stack ) - 1
2021-08-10 14:32:39 +03:00
2022-09-25 14:09:41 +03:00
// Pop last frame
frame := stack [ idx ]
stack = stack [ : idx ]
2021-08-10 14:32:39 +03:00
2022-09-25 14:09:41 +03:00
return frame
}
)
2021-08-10 14:32:39 +03:00
2022-09-25 14:09:41 +03:00
stackLoop :
for i := 0 ; i < maxIter ; i ++ {
// Pop next frame, nil means we are at end
if current = popStack ( ) ; current == nil {
return nil
2021-08-10 14:32:39 +03:00
}
2022-09-26 11:14:36 +03:00
pageLoop :
for {
2023-09-05 13:22:02 +03:00
l . Tracef ( "following collection page: %s" , current . pageURI )
2022-09-25 14:09:41 +03:00
itemLoop :
2022-09-26 11:50:14 +03:00
for {
2023-09-23 20:28:12 +03:00
// Get next item from page iter.
next := current . page . NextItem ( )
if next == nil {
2022-09-26 11:50:14 +03:00
break itemLoop
}
2021-08-10 14:32:39 +03:00
2024-01-31 16:29:47 +03:00
// Check for available IRI.
2023-09-23 20:28:12 +03:00
itemIRI , _ := pub . ToId ( next )
2022-09-25 14:09:41 +03:00
if itemIRI == nil {
continue itemLoop
}
2023-09-05 13:22:02 +03:00
if itemIRI . Host == localhost {
2022-09-25 14:09:41 +03:00
// This child is one of ours,
continue itemLoop
}
2023-05-12 12:15:54 +03:00
// Dereference the remote status and store in the database.
2023-06-24 10:32:10 +03:00
// getStatusByURI guards against the following conditions:
2023-09-05 13:22:02 +03:00
// - refetching recently fetched statuses (recursion!)
2023-06-24 10:32:10 +03:00
// - remote domain is blocked (will return unretrievable)
2023-09-05 13:22:02 +03:00
// - any http type error for a new status returns unretrievable
2023-11-04 23:21:20 +03:00
_ , statusable , _ , err := d . getStatusByURI ( ctx , username , itemIRI )
2022-09-25 14:09:41 +03:00
if err != nil {
2024-01-31 16:29:47 +03:00
l . Errorf ( "error dereferencing remote status %s: %v" , itemIRI , err )
2023-05-12 12:15:54 +03:00
continue itemLoop
}
if statusable == nil {
2023-09-05 13:22:02 +03:00
// A nil statusable return from
// getStatusByURI() indicates a
// remote status that was already
// dereferenced recently (so no
// need to go through descendents).
continue itemLoop
}
2023-09-23 20:28:12 +03:00
// Extract any attached collection + ID URI from status.
page , pageURI := getAttachedStatusCollectionPage ( statusable )
2023-09-05 13:22:02 +03:00
if page == nil {
2022-09-25 14:09:41 +03:00
continue itemLoop
2021-08-10 14:32:39 +03:00
}
2022-09-25 14:09:41 +03:00
// Put current and next frame at top of stack
stack = append ( stack , current , & frame {
2023-09-05 13:22:02 +03:00
pageURI : pageURI ,
page : page ,
2022-09-25 14:09:41 +03:00
} )
2022-09-26 10:39:59 +03:00
// Now start at top of loop
continue stackLoop
2021-08-10 14:32:39 +03:00
}
2023-09-23 20:28:12 +03:00
// Get the next page from iterator.
next := current . page . NextPage ( )
if next == nil || ! next . IsIRI ( ) {
2022-09-25 14:09:41 +03:00
continue stackLoop
}
2023-09-23 20:28:12 +03:00
// Get the next page IRI.
nextURI := next . GetIRI ( )
nextURIStr := nextURI . String ( )
2023-09-05 13:22:02 +03:00
// Check whether this page has already been deref'd.
2023-09-23 20:28:12 +03:00
if _ , ok := derefdPages [ nextURIStr ] ; ok {
l . Warnf ( "self referencing collection page(s): %s" , nextURIStr )
2022-09-26 11:14:36 +03:00
continue stackLoop
}
2022-09-25 14:09:41 +03:00
2023-09-05 13:22:02 +03:00
// Mark this collection page as deref'd.
2023-09-23 20:28:12 +03:00
derefdPages [ nextURIStr ] = struct { } { }
2023-09-05 13:22:02 +03:00
// Dereference this next collection page by its IRI.
2023-05-12 12:15:54 +03:00
collectionPage , err := d . dereferenceCollectionPage ( ctx ,
username ,
2023-09-23 20:28:12 +03:00
nextURI ,
2023-05-12 12:15:54 +03:00
)
2022-09-25 14:09:41 +03:00
if err != nil {
2023-09-23 20:28:12 +03:00
l . Errorf ( "error dereferencing collection page %q: %s" , nextURIStr , err )
2022-09-25 14:09:41 +03:00
continue stackLoop
}
2023-09-05 13:22:02 +03:00
// Set the next collection page.
2022-09-25 14:09:41 +03:00
current . page = collectionPage
2023-09-23 20:28:12 +03:00
current . pageURI = nextURIStr
2022-09-26 11:14:36 +03:00
continue pageLoop
2021-08-10 14:32:39 +03:00
}
}
2023-09-05 13:22:02 +03:00
return gtserror . Newf ( "reached %d descendant iterations for %q" , maxIter , statusIRIStr )
}
2024-02-23 18:24:40 +03:00
// updateStatusParent updates the given status' parent
// status URI, ID and account ID to given values in DB.
//
// The three fields are first set on the passed status model
// itself, and then only the in_reply_to_id, in_reply_to_uri and
// in_reply_to_account_id columns are passed to the database
// update. Passing empty strings for all three clears the parent
// reference. Any database error is returned wrapped with the
// status URI for context.
func (d *Dereferencer) updateStatusParent(
	ctx context.Context,
	status *gtsmodel.Status,
	parentStatusID string,
	parentStatusURI string,
	parentAccountID string,
) error {
	status.InReplyToAccountID = parentAccountID
	status.InReplyToURI = parentStatusURI
	status.InReplyToID = parentStatusID
	if err := d.state.DB.UpdateStatus(ctx,
		status,
		"in_reply_to_id",
		"in_reply_to_uri",
		"in_reply_to_account_id",
	); err != nil {
		return gtserror.Newf("error updating status %s: %w", status.URI, err)
	}
	return nil
}