From e4f2c6c403ba5426e1f75f476df0e70c4f676ae7 Mon Sep 17 00:00:00 2001 From: Mads Marquart Date: Mon, 13 Jan 2020 15:11:20 +0100 Subject: [PATCH 1/7] Add get_limits helper --- fbchat/_util.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fbchat/_util.py b/fbchat/_util.py index 6beed9e..4831fa7 100644 --- a/fbchat/_util.py +++ b/fbchat/_util.py @@ -13,6 +13,8 @@ from ._exception import ( FBchatPleaseRefresh, ) +from typing import Iterable, Optional + #: Default list of user agents USER_AGENTS = [ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36", @@ -24,6 +26,24 @@ USER_AGENTS = [ ] +def get_limits(limit: Optional[int], max_limit: int) -> Iterable[int]: + """Helper that generates limits based on a max limit.""" + if limit is None: + # Generate infinite items + while True: + yield max_limit + + if limit < 0: + raise ValueError("Limit cannot be negative") + + # Generate n items + yield from [max_limit] * (limit // max_limit) + + remainder = limit % max_limit + if remainder: + yield remainder + + def now(): return int(time.time() * 1000) From e76c6179fbb0a2579e71f89aba1f89b320cad558 Mon Sep 17 00:00:00 2001 From: Mads Marquart Date: Mon, 13 Jan 2020 15:54:09 +0100 Subject: [PATCH 2/7] Improve message searching in ThreadABC --- fbchat/_message.py | 28 +++++++++++++++++++++++++ fbchat/_thread.py | 52 +++++++++++++++++++++++++++++++--------------- 2 files changed, 63 insertions(+), 17 deletions(-) diff --git a/fbchat/_message.py b/fbchat/_message.py index 58b2534..590dd03 100644 --- a/fbchat/_message.py +++ b/fbchat/_message.py @@ -170,6 +170,34 @@ class Message: return result, mentions +@attrs_default +class MessageSnippet(Message): + """Represents data in a Facebook message snippet. + + Inherits `Message`. 
+ """ + + #: ID of the sender + author = attr.ib() + #: Datetime of when the message was sent + created_at = attr.ib() + #: The actual message + text = attr.ib() + #: A dict with offsets, mapped to the matched text + matched_keywords = attr.ib() + + @classmethod + def _parse(cls, thread, data): + return cls( + thread=thread, + id=data["message_id"], + author=data["author"].rstrip("fbid:"), + created_at=_util.millis_to_datetime(data["timestamp"]), + text=data["body"], + matched_keywords={int(k): v for k, v in data["matched_keywords"].items()}, + ) + + @attrs_default class MessageData(Message): """Represents data in a Facebook message. diff --git a/fbchat/_thread.py b/fbchat/_thread.py index 1bdc742..2f8a064 100644 --- a/fbchat/_thread.py +++ b/fbchat/_thread.py @@ -250,20 +250,9 @@ class ThreadABC(metaclass=abc.ABCMeta): # ) # return self.send(Message(text=payload, quick_replies=[new])) - def search_messages( - self, query: str, offset: int = 0, limit: int = 5 - ) -> Iterable[str]: - """Find and get message IDs by query. + def _search_messages(self, query, offset, limit): + from . import _message - Args: - query: Text to search for - offset (int): Number of messages to skip - limit (int): Max. number of messages to retrieve - - Returns: - typing.Iterable: Found Message IDs - """ - # TODO: Return proper searchable iterator data = { "query": query, "snippetOffset": offset, @@ -273,10 +262,39 @@ class ThreadABC(metaclass=abc.ABCMeta): } j = self.session._payload_post("/ajax/mercury/search_snippets.php?dpr=1", data) - result = j["search_snippets"][query] - snippets = result[self.id]["snippets"] if result.get(self.id) else [] - for snippet in snippets: - yield snippet["message_id"] + result = j["search_snippets"][query].get(self.id) + if not result: + return (0, []) + + # TODO: May or may not be a good idea to attach the current thread? 
+ # For now, we just create a new thread: + thread = self.__class__(session=self.session, id=self.id) + snippets = [ + _message.MessageSnippet._parse(thread, snippet) + for snippet in result["snippets"] + ] + return (result["num_total_snippets"], snippets) + + def search_messages(self, query: str, limit: int) -> Iterable["MessageSnippet"]: + """Find and get message snippets by query. + + Warning! If someone sends a message to the thread that matches the query, while + we're searching, some snippets will get returned twice. + + Not sure if we should handle it, Facebook's implementation doesn't... + + Args: + query: Text to search for + limit: Max. number of message snippets to retrieve + """ + offset = 0 + # The max limit is measured empirically to 420, safe default chosen below + for limit in _util.get_limits(limit, max_limit=50): + _, snippets = self._search_messages(query, offset, limit) + yield from snippets + if len(snippets) < limit: + return # No more data to fetch + offset += limit def fetch_messages(self, limit: int = 20, before: datetime.datetime = None): """Fetch messages in a thread, ordered by most recent. From 55182e21b622b2e8561aefe46ff592bd449caa4e Mon Sep 17 00:00:00 2001 From: Mads Marquart Date: Mon, 13 Jan 2020 16:54:34 +0100 Subject: [PATCH 3/7] Improve message searching in Client --- fbchat/_client.py | 83 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 26 deletions(-) diff --git a/fbchat/_client.py b/fbchat/_client.py index 278c1e4..8449291 100644 --- a/fbchat/_client.py +++ b/fbchat/_client.py @@ -3,7 +3,7 @@ import time import requests from ._core import log -from . import _util, _graphql, _session, _poll, _user +from .
import _util, _graphql, _session, _poll, _user, _thread, _message from ._exception import FBchatException, FBchatFacebookError from ._thread import ThreadLocation @@ -24,7 +24,7 @@ from ._quick_reply import ( ) from ._plan import PlanData -from typing import Sequence +from typing import Sequence, Iterable, Tuple, Optional class Client: @@ -183,37 +183,68 @@ class Client: return rtn - def search(self, query, fetch_messages=False, thread_limit=5, message_limit=5): + def _search_messages(self, query, offset, limit): + data = {"query": query, "offset": offset, "limit": limit} + j = self.session._payload_post("/ajax/mercury/search_snippets.php?dpr=1", data) + + total_snippets = j["search_snippets"][query] + + rtn = [] + for node in j["graphql_payload"]["message_threads"]: + type_ = node["thread_type"] + if type_ == "GROUP": + thread = Group( + session=self.session, id=node["thread_key"]["thread_fbid"] + ) + elif type_ == "ONE_TO_ONE": + thread = _thread.Thread( + session=self.session, id=node["thread_key"]["other_user_id"] + ) + # if True: # TODO: This check! + # thread = UserData._from_graphql(self.session, node) + # else: + # thread = PageData._from_graphql(self.session, node) + else: + thread = None + log.warning("Unknown thread type %s, data: %s", type_, node) + + if thread: + rtn.append((thread, total_snippets[thread.id]["num_total_snippets"])) + else: + rtn.append((None, 0)) + + return rtn + + def search_messages( + self, query: str, limit: Optional[int] + ) -> Iterable[Tuple[_thread.ThreadABC, int]]: """Search for messages in all threads. + Intended to be used alongside `ThreadABC.search_messages` + + Warning! If someone sends a message to a thread that matches the query, while + we're searching, some snippets will get returned twice. + + Not sure if we should handle it, Facebook's implementation doesn't... + Args: query: Text to search for - fetch_messages: Whether to fetch `Message` objects or IDs only - thread_limit (int): Max.
number of threads to retrieve - message_limit (int): Max. number of messages to retrieve + limit: Max. number of threads to retrieve. If ``None``, all threads will be + retrieved. Returns: - typing.Dict[str, typing.Iterable]: Dictionary with thread IDs as keys and iterables to get messages as values - - Raises: - FBchatException: If request failed + Iterable with tuples of threads, and the total amount of matches. """ - data = {"query": query, "snippetLimit": thread_limit} - j = self.session._payload_post("/ajax/mercury/search_snippets.php?dpr=1", data) - result = j["search_snippets"][query] - - if not result: - return {} - - if fetch_messages: - search_method = self.search_for_messages - else: - search_method = self.search_for_message_ids - - return { - thread_id: search_method(query, limit=message_limit, thread_id=thread_id) - for thread_id in result - } + offset = 0 + # The max limit is measured empirically to ~500, safe default chosen below + for limit in _util.get_limits(limit, max_limit=100): + data = self._search_messages(query, offset, limit) + for thread, total_snippets in data: + if thread: + yield (thread, total_snippets) + if len(data) < limit: + return # No more data to fetch + offset += limit def _fetch_info(self, *ids): data = {"ids[{}]".format(i): _id for i, _id in enumerate(ids)} From 117433da8a765522a660e3f1ac161c306912b329 Mon Sep 17 00:00:00 2001 From: Mads Marquart Date: Tue, 14 Jan 2020 18:47:14 +0100 Subject: [PATCH 4/7] Improve image fetching in ThreadABC --- examples/fetch.py | 11 +++++---- fbchat/_file.py | 3 --- fbchat/_thread.py | 61 ++++++++++++++++++++++++++++------------------ tests/test_file.py | 4 +-- 4 files changed, 45 insertions(+), 34 deletions(-) diff --git a/examples/fetch.py b/examples/fetch.py index 30989ea..e4f4987 100644 --- a/examples/fetch.py +++ b/examples/fetch.py @@ -1,4 +1,3 @@ -import itertools import fbchat session = fbchat.Session.login("", "") @@ -62,7 +61,9 @@ print("thread's name: {}".format(thread.name)) # 
Here should be an example of `getUnread` -# Print image url for 20 last images from thread. -images = thread.fetch_images() -for image in itertools.islice(image, 20): - print(image.large_preview_url) +# Print image url for up to 20 last images from thread. +images = list(thread.fetch_images(limit=20)) +for image in images: + if isinstance(image, fbchat.ImageAttachment): + url = c.fetch_image_url(image.id) + print(url) diff --git a/fbchat/_file.py b/fbchat/_file.py index 5e68af4..a03a75d 100644 --- a/fbchat/_file.py +++ b/fbchat/_file.py @@ -91,8 +91,6 @@ class ImageAttachment(Attachment): @classmethod def _from_list(cls, data): - data = data["node"] - previews = { Image._from_uri_or_none(data["image"]), Image._from_uri(data["image1"]), @@ -156,7 +154,6 @@ class VideoAttachment(Attachment): @classmethod def _from_list(cls, data): - data = data["node"] previews = { Image._from_uri(data["image"]), Image._from_uri(data["image1"]), diff --git a/fbchat/_thread.py b/fbchat/_thread.py index 2f8a064..aa7a9ac 100644 --- a/fbchat/_thread.py +++ b/fbchat/_thread.py @@ -338,35 +338,48 @@ class ThreadABC(metaclass=abc.ABCMeta): return messages - def fetch_images(self): - """Fetch images/videos posted in the thread.""" - # TODO: Return proper searchable iterator - data = {"id": self.id, "first": 48} + def _fetch_images(self, limit, after): + data = {"id": self.id, "first": limit, "after": after} (j,) = self.session._graphql_requests( _graphql.from_query_id("515216185516880", data) ) - while True: - try: - i = j[self.id]["message_shared_media"]["edges"][0] - except IndexError: - if j[self.id]["message_shared_media"]["page_info"].get("has_next_page"): - data["after"] = j[self.id]["message_shared_media"]["page_info"].get( - "end_cursor" - ) - (j,) = self.session._graphql_requests( - _graphql.from_query_id("515216185516880", data) - ) - continue - else: - break - if i["node"].get("__typename") == "MessageImage": - yield _file.ImageAttachment._from_list(i) - elif 
i["node"].get("__typename") == "MessageVideo": - yield _file.VideoAttachment._from_list(i) + result = j[self.id]["message_shared_media"] + + print(len(result["edges"])) + + rtn = [] + for edge in result["edges"]: + node = edge["node"] + type_ = node["__typename"] + if type_ == "MessageImage": + rtn.append(_file.ImageAttachment._from_list(node)) + elif type_ == "MessageVideo": + rtn.append(_file.VideoAttachment._from_list(node)) else: - yield _attachment.Attachment(id=i["node"].get("legacy_attachment_id")) - del j[self.id]["message_shared_media"]["edges"][0] + log.warning("Unknown image type %s, data: %s", type_, edge) + rtn.append(None) + + # result["page_info"]["has_next_page"] is not correct when limit > 12 + return (result["page_info"]["end_cursor"], rtn) + + def fetch_images(self, limit: int) -> Iterable[_attachment.Attachment]: + """Fetch images/videos posted in the thread. + + Args: + limit: Max. number of images to retrieve. If ``None``, all images will be + retrieved. + """ + cursor = None + # The max limit on this request is unknown, so we set it reasonably high + # This way `limit=None` also still works + for limit in _util.get_limits(limit, max_limit=1000): + cursor, images = self._fetch_images(limit, cursor) + if not images: + return # No more data to fetch + for image in images: + if image: + yield image def set_nickname(self, user_id: str, nickname: str): """Change the nickname of a user in the thread. 
diff --git a/tests/test_file.py b/tests/test_file.py index c40de05..730a6ca 100644 --- a/tests/test_file.py +++ b/tests/test_file.py @@ -46,7 +46,7 @@ def test_imageattachment_from_list(): height=988, ), }, - ) == ImageAttachment._from_list({"node": data}) + ) == ImageAttachment._from_list(data) def test_videoattachment_from_list(): @@ -88,7 +88,7 @@ def test_videoattachment_from_list(): height=368, ), }, - ) == VideoAttachment._from_list({"node": data}) + ) == VideoAttachment._from_list(data) def test_graphql_to_attachment_empty(): From 60cce0d11276710f5a2e848fcb3068ef729f9d3a Mon Sep 17 00:00:00 2001 From: Mads Marquart Date: Tue, 14 Jan 2020 21:28:54 +0100 Subject: [PATCH 5/7] Refactor Client.fetch_thread_list to return an iterable --- fbchat/_client.py | 71 ++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/fbchat/_client.py b/fbchat/_client.py index 8449291..d2fe888 100644 --- a/fbchat/_client.py +++ b/fbchat/_client.py @@ -348,33 +348,10 @@ class Client: return rtn - def fetch_thread_list( - self, limit=20, thread_location=ThreadLocation.INBOX, before=None - ): - """Fetch the client's thread list. - - Args: - limit (int): Max. number of threads to retrieve. 
Capped at 20 - thread_location (ThreadLocation): INBOX, PENDING, ARCHIVED or OTHER - before (datetime.datetime): The point from which to retrieve threads - - Returns: - list: `Thread` objects - - Raises: - FBchatException: If request failed - """ - if limit > 20 or limit < 1: - raise ValueError("`limit` should be between 1 and 20") - - if thread_location in ThreadLocation: - loc_str = thread_location.value - else: - raise TypeError('"thread_location" must be a value of ThreadLocation') - + def _fetch_threads(self, limit, before, folders): params = { "limit": limit, - "tags": [loc_str], + "tags": folders, "before": _util.datetime_to_millis(before) if before else None, "includeDeliveryReceipts": True, "includeSeqID": False, @@ -389,15 +366,47 @@ class Client: if _type == "GROUP": rtn.append(GroupData._from_graphql(self.session, node)) elif _type == "ONE_TO_ONE": - user = UserData._from_thread_fetch(self.session, node) - if user: - rtn.append(user) + rtn.append(UserData._from_thread_fetch(self.session, node)) else: - raise FBchatException( - "Unknown thread type: {}, with data: {}".format(_type, node) - ) + rtn.append(None) + log.warning("Unknown thread type: %s, data: %s", _type, node) return rtn + def fetch_threads( + self, limit: Optional[int], location: ThreadLocation = ThreadLocation.INBOX, + ) -> Iterable[_thread.ThreadABC]: + """Fetch the client's thread list. + + Args: + limit: Max. number of threads to retrieve. If ``None``, all threads will be + retrieved. 
+ location: INBOX, PENDING, ARCHIVED or OTHER + """ + # This is measured empirically as 837, safe default chosen below + MAX_BATCH_LIMIT = 100 + + # TODO: Clean this up after implementing support for more threads types + seen_ids = set() + before = None + for limit in _util.get_limits(limit, MAX_BATCH_LIMIT): + threads = self._fetch_threads(limit, before, [location.value]) + + before = None + for thread in threads: + # Don't return seen and unknown threads + if thread and thread.id not in seen_ids: + seen_ids.add(thread.id) + # TODO: Ensure type-wise that .last_active is available + before = thread.last_active + yield thread + + if len(threads) < MAX_BATCH_LIMIT: + return # No more data to fetch + + # We check this here in case _fetch_threads only returned `None` threads + if not before: + raise ValueError("Too many unknown threads.") + def fetch_unread(self): """Fetch unread threads. From 22dcf6d69a494433c70d5e58f445477ef5d751da Mon Sep 17 00:00:00 2001 From: Mads Marquart Date: Tue, 14 Jan 2020 21:53:55 +0100 Subject: [PATCH 6/7] Update ThreadABC.fetch_messages --- fbchat/_thread.py | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/fbchat/_thread.py b/fbchat/_thread.py index aa7a9ac..923f7e7 100644 --- a/fbchat/_thread.py +++ b/fbchat/_thread.py @@ -296,28 +296,20 @@ class ThreadABC(metaclass=abc.ABCMeta): return # No more data to fetch offset += limit - def fetch_messages(self, limit: int = 20, before: datetime.datetime = None): - """Fetch messages in a thread, ordered by most recent. - - Args: - limit: Max. number of messages to retrieve - before: The point from which to retrieve messages - - Returns: - list: `Message` objects - """ + def _fetch_messages(self, limit, before): from . 
import _message - # TODO: Return proper searchable iterator params = { "id": self.id, "message_limit": limit, "load_messages": True, "load_read_receipts": True, + # "load_delivery_receipts": False, + # "is_work_teamwork_not_putting_muted_in_unreads": False, "before": _util.datetime_to_millis(before) if before else None, } (j,) = self.session._graphql_requests( - _graphql.from_doc_id("1860982147341344", params) + _graphql.from_doc_id("1860982147341344", params) # 2696825200377124 ) if j.get("message_thread") is None: @@ -325,18 +317,43 @@ class ThreadABC(metaclass=abc.ABCMeta): "Could not fetch thread {}: {}".format(self.id, j) ) + # TODO: Should we parse the returned thread data, too? + read_receipts = j["message_thread"]["read_receipts"]["nodes"] # TODO: May or may not be a good idea to attach the current thread? # For now, we just create a new thread: thread = self.__class__(session=self.session, id=self.id) - messages = [ + return [ _message.MessageData._from_graphql(thread, message, read_receipts) for message in j["message_thread"]["messages"]["nodes"] ] - messages.reverse() - return messages + def fetch_messages(self, limit: Optional[int]) -> Iterable["_message.Message"]: + """Fetch messages in a thread, with most recent messages first. + + Args: + limit: Max. number of messages to retrieve. If ``None``, all messages will be + retrieved.
+ """ + # This is measured empirically as 210 in extreme cases, fairly safe default + # chosen below + MAX_BATCH_LIMIT = 100 + + before = None + for limit in _util.get_limits(limit, MAX_BATCH_LIMIT): + messages = self._fetch_messages(limit, before) + + if before: + # Strip the first message + yield from messages[1:] + else: + yield from messages + + if len(messages) < MAX_BATCH_LIMIT: + return # No more data to fetch + + before = messages[-1].created_at def _fetch_images(self, limit, after): data = {"id": self.id, "first": limit, "after": after} From 2b45fdbc8ab25d38acb894ac9474e7794ac1472f Mon Sep 17 00:00:00 2001 From: Mads Marquart Date: Tue, 14 Jan 2020 22:06:12 +0100 Subject: [PATCH 7/7] Make Client.search_for_X more forwards compatible --- fbchat/_client.py | 55 ++++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 39 deletions(-) diff --git a/fbchat/_client.py b/fbchat/_client.py index d2fe888..5f1c999 100644 --- a/fbchat/_client.py +++ b/fbchat/_client.py @@ -3,7 +3,7 @@ import time import requests from ._core import log -from . import _util, _graphql, _session, _poll, _user, _thread, _message +from . import _util, _graphql, _session, _poll, _user, _page, _group, _thread, _message from ._exception import FBchatException, FBchatFacebookError from ._thread import ThreadLocation @@ -78,7 +78,7 @@ class Client: users.append(_user.UserData._from_all_fetch(self.session, data)) return users - def search_for_users(self, name, limit=10): + def search_for_users(self, name: str, limit: int) -> Iterable[_user.UserData]: """Find and get users by their name.
Args: @@ -87,92 +87,71 @@ class Client: Returns: list: `User` objects, ordered by relevance - - Raises: - FBchatException: If request failed """ params = {"search": name, "limit": limit} (j,) = self.session._graphql_requests( _graphql.from_query(_graphql.SEARCH_USER, params) ) - return [ + return ( UserData._from_graphql(self.session, node) for node in j[name]["users"]["nodes"] - ] + ) - def search_for_pages(self, name, limit=10): + def search_for_pages(self, name: str, limit: int) -> Iterable[_page.PageData]: """Find and get pages by their name. Args: name: Name of the page - - Returns: - list: `Page` objects, ordered by relevance - - Raises: - FBchatException: If request failed + limit: The max. amount of pages to fetch """ params = {"search": name, "limit": limit} (j,) = self.session._graphql_requests( _graphql.from_query(_graphql.SEARCH_PAGE, params) ) - return [ + return ( PageData._from_graphql(self.session, node) for node in j[name]["pages"]["nodes"] - ] + ) - def search_for_groups(self, name, limit=10): + def search_for_groups(self, name: str, limit: int) -> Iterable[_group.GroupData]: """Find and get group threads by their name. Args: name: Name of the group thread limit: The max. amount of groups to fetch - - Returns: - list: `Group` objects, ordered by relevance - - Raises: - FBchatException: If request failed """ params = {"search": name, "limit": limit} (j,) = self.session._graphql_requests( _graphql.from_query(_graphql.SEARCH_GROUP, params) ) - return [ + return ( GroupData._from_graphql(self.session, node) for node in j["viewer"]["groups"]["nodes"] - ] + ) - def search_for_threads(self, name, limit=10): + def search_for_threads(self, name: str, limit: int) -> Iterable[_thread.ThreadABC]: """Find and get threads by their name. Args: name: Name of the thread - limit: The max. amount of groups to fetch - - Returns: - list: `User`, `Group` and `Page` objects, ordered by relevance - - Raises: - FBchatException: If request failed + limit: The max. 
amount of threads to fetch """ params = {"search": name, "limit": limit} (j,) = self.session._graphql_requests( _graphql.from_query(_graphql.SEARCH_THREAD, params) ) - rtn = [] for node in j[name]["threads"]["nodes"]: if node["__typename"] == "User": - rtn.append(UserData._from_graphql(self.session, node)) + yield UserData._from_graphql(self.session, node) elif node["__typename"] == "MessageThread": # MessageThread => Group thread - rtn.append(GroupData._from_graphql(self.session, node)) + yield GroupData._from_graphql(self.session, node) elif node["__typename"] == "Page": - rtn.append(PageData._from_graphql(self.session, node)) + yield PageData._from_graphql(self.session, node) elif node["__typename"] == "Group": # We don't handle Facebook "Groups" pass @@ -181,8 +160,6 @@ class Client: "Unknown type {} in {}".format(repr(node["__typename"]), node) ) - return rtn - def _search_messages(self, query, offset, limit): data = {"query": query, "offset": offset, "limit": limit} j = self.session._payload_post("/ajax/mercury/search_snippets.php?dpr=1", data)