Merge pull request #507 from carpedm20/refactor-limits

Refactor method limits
2020-01-14 22:13:58 +01:00
parent 3c35770eca 2b45fdbc8a
commit 656281eacb
7 changed files with 267 additions and 156 deletions
--- a/examples/fetch.py
+++ b/examples/fetch.py
@@ -1,4 +1,3 @@
 import itertools
 import fbchat
 session = fbchat.Session.login("<email>", "<password>")
@@ -62,7 +61,9 @@ print("thread's name: {}".format(thread.name))
 # Here should be an example of `getUnread`
-# Print image url for 20 last images from thread.
+# Print image url for up to 20 last images from thread.
-images = thread.fetch_images()
+images = list(thread.fetch_images(limit=20))
-for image in itertools.islice(image, 20):
+for image in images:
-    print(image.large_preview_url)
+    if isinstance(image, fbchat.ImageAttachment):
        url = c.fetch_image_url(image.id)
        print(url)
--- a/fbchat/_client.py
+++ b/fbchat/_client.py
@@ -3,7 +3,7 @@ import time
 import requests
 from ._core import log
-from . import _util, _graphql, _session, _poll, _user
+from . import _util, _graphql, _session, _poll, _user, _page, _group, _thread, _message
 from ._exception import FBchatException, FBchatFacebookError
 from ._thread import ThreadLocation
@@ -24,7 +24,7 @@ from ._quick_reply import (
 )
 from ._plan import PlanData
-from typing import Sequence
+from typing import Sequence, Iterable, Tuple, Optional
 class Client:
@@ -78,7 +78,7 @@ class Client:
            users.append(_user.UserData._from_all_fetch(self.session, data))
        return users
-    def search_for_users(self, name, limit=10):
+    def search_for_users(self, name: str, limit: int) -> Iterable[_user.UserData]:
        """Find and get users by their name.
        Args:
@@ -87,92 +87,71 @@ class Client:
        Returns:
            list: `User` objects, ordered by relevance
        Raises:
            FBchatException: If request failed
        """
        params = {"search": name, "limit": limit}
        (j,) = self.session._graphql_requests(
            _graphql.from_query(_graphql.SEARCH_USER, params)
        )
-        return [
+        return (
            UserData._from_graphql(self.session, node)
            for node in j[name]["users"]["nodes"]
-        ]
+        )
-    def search_for_pages(self, name, limit=10):
+    def search_for_pages(self, name: str, limit: int) -> Iterable[_page.PageData]:
        """Find and get pages by their name.
        Args:
            name: Name of the page
-
+            limit: The max. amount of pages to fetch
        Returns:
            list: `Page` objects, ordered by relevance
        Raises:
            FBchatException: If request failed
        """
        params = {"search": name, "limit": limit}
        (j,) = self.session._graphql_requests(
            _graphql.from_query(_graphql.SEARCH_PAGE, params)
        )
-        return [
+        return (
            PageData._from_graphql(self.session, node)
            for node in j[name]["pages"]["nodes"]
-        ]
+        )
-    def search_for_groups(self, name, limit=10):
+    def search_for_groups(self, name: str, limit: int) -> Iterable[_group.GroupData]:
        """Find and get group threads by their name.
        Args:
            name: Name of the group thread
            limit: The max. amount of groups to fetch
        Returns:
            list: `Group` objects, ordered by relevance
        Raises:
            FBchatException: If request failed
        """
        params = {"search": name, "limit": limit}
        (j,) = self.session._graphql_requests(
            _graphql.from_query(_graphql.SEARCH_GROUP, params)
        )
-        return [
+        return (
            GroupData._from_graphql(self.session, node)
            for node in j["viewer"]["groups"]["nodes"]
-        ]
+        )
-    def search_for_threads(self, name, limit=10):
+    def search_for_threads(self, name: str, limit: int) -> Iterable[_thread.ThreadABC]:
        """Find and get threads by their name.
        Args:
            name: Name of the thread
-            limit: The max. amount of groups to fetch
+            limit: The max. amount of threads to fetch
        Returns:
            list: `User`, `Group` and `Page` objects, ordered by relevance
        Raises:
            FBchatException: If request failed
        """
        params = {"search": name, "limit": limit}
        (j,) = self.session._graphql_requests(
            _graphql.from_query(_graphql.SEARCH_THREAD, params)
        )
        rtn = []
        for node in j[name]["threads"]["nodes"]:
            if node["__typename"] == "User":
-                rtn.append(UserData._from_graphql(self.session, node))
+                yield UserData._from_graphql(self.session, node)
            elif node["__typename"] == "MessageThread":
                # MessageThread => Group thread
-                rtn.append(GroupData._from_graphql(self.session, node))
+                yield GroupData._from_graphql(self.session, node)
            elif node["__typename"] == "Page":
-                rtn.append(PageData._from_graphql(self.session, node))
+                yield PageData._from_graphql(self.session, node)
            elif node["__typename"] == "Group":
                # We don't handle Facebook "Groups"
                pass
@@ -181,39 +160,68 @@ class Client:
                    "Unknown type {} in {}".format(repr(node["__typename"]), node)
                )
    def _search_messages(self, query, offset, limit):
        """Fetch one page of threads whose messages match ``query``.

        Posts ``query``, ``offset`` and ``limit`` to the mercury
        ``search_snippets`` endpoint and pairs each returned thread with the
        ``num_total_snippets`` value reported for it.

        Args:
            query: Text to search for
            offset: Passed through as the request's ``offset``
            limit: Passed through as the request's ``limit``

        Returns:
            list: One ``(thread, count)`` tuple per node in the response's
            ``message_threads``. Nodes with an unrecognised ``thread_type`` are
            logged and reported as ``(None, 0)``, so the list always has one
            entry per returned node.
        """
        payload = self.session._payload_post(
            "/ajax/mercury/search_snippets.php?dpr=1",
            {"query": query, "offset": offset, "limit": limit},
        )
        snippet_counts = payload["search_snippets"][query]

        results = []
        for node in payload["graphql_payload"]["message_threads"]:
            kind = node["thread_type"]
            thread = None
            if kind == "GROUP":
                thread = Group(
                    session=self.session, id=node["thread_key"]["thread_fbid"]
                )
            elif kind == "ONE_TO_ONE":
                # TODO: Tell users and pages apart here; for now a generic
                # thread object is returned for every one-to-one conversation.
                thread = _thread.Thread(
                    session=self.session, id=node["thread_key"]["other_user_id"]
                )
            else:
                log.warning("Unknown thread type %s, data: %s", kind, node)

            if not thread:
                # Keep a placeholder so callers can still count returned nodes
                results.append((None, 0))
                continue
            results.append((thread, snippet_counts[thread.id]["num_total_snippets"]))
        return results
-    def search(self, query, fetch_messages=False, thread_limit=5, message_limit=5):
+    def search_messages(
        self, query: str, limit: Optional[int]
    ) -> Iterable[Tuple[_thread.ThreadABC, int]]:
        """Search for messages in all threads.
        Intended to be used alongside `ThreadABC.search_messages`
        Warning! If someone send a message to a thread that matches the query, while
        we're searching, some snippets will get returned twice.
        Not sure if we should handle it, Facebook's implementation doesn't...
        Args:
            query: Text to search for
-            fetch_messages: Whether to fetch `Message` objects or IDs only
+            limit: Max. number of threads to retrieve. If ``None``, all threads will be
-            thread_limit (int): Max. number of threads to retrieve
+                retrieved.
            message_limit (int): Max. number of messages to retrieve
        Returns:
-            typing.Dict[str, typing.Iterable]: Dictionary with thread IDs as keys and iterables to get messages as values
+            Iterable with tuples of threads, and the total amount of matches.
        Raises:
            FBchatException: If request failed
        """
-        data = {"query": query, "snippetLimit": thread_limit}
+        offset = 0
-        j = self.session._payload_post("/ajax/mercury/search_snippets.php?dpr=1", data)
+        # The max limit is measured empirically to ~500, safe default chosen below
-        result = j["search_snippets"][query]
+        for limit in _util.get_limits(limit, max_limit=100):
-
+            data = self._search_messages(query, offset, limit)
-        if not result:
+            for thread, total_snippets in data:
-            return {}
+                if thread:
-
+                    yield (thread, total_snippets)
-        if fetch_messages:
+            if len(data) < limit:
-            search_method = self.search_for_messages
+                return  # No more data to fetch
-        else:
+            offset += limit
            search_method = self.search_for_message_ids
        return {
            thread_id: search_method(query, limit=message_limit, thread_id=thread_id)
            for thread_id in result
        }
    def _fetch_info(self, *ids):
        data = {"ids[{}]".format(i): _id for i, _id in enumerate(ids)}
@@ -317,33 +325,10 @@ class Client:
        return rtn
-    def fetch_thread_list(
+    def _fetch_threads(self, limit, before, folders):
        self, limit=20, thread_location=ThreadLocation.INBOX, before=None
    ):
        """Fetch the client's thread list.
        Args:
            limit (int): Max. number of threads to retrieve. Capped at 20
            thread_location (ThreadLocation): INBOX, PENDING, ARCHIVED or OTHER
            before (datetime.datetime): The point from which to retrieve threads
        Returns:
            list: `Thread` objects
        Raises:
            FBchatException: If request failed
        """
        if limit > 20 or limit < 1:
            raise ValueError("`limit` should be between 1 and 20")
        if thread_location in ThreadLocation:
            loc_str = thread_location.value
        else:
            raise TypeError('"thread_location" must be a value of ThreadLocation')
        params = {
            "limit": limit,
-            "tags": [loc_str],
+            "tags": folders,
            "before": _util.datetime_to_millis(before) if before else None,
            "includeDeliveryReceipts": True,
            "includeSeqID": False,
@@ -358,15 +343,47 @@ class Client:
            if _type == "GROUP":
                rtn.append(GroupData._from_graphql(self.session, node))
            elif _type == "ONE_TO_ONE":
-                user = UserData._from_thread_fetch(self.session, node)
+                rtn.append(UserData._from_thread_fetch(self.session, node))
                if user:
                    rtn.append(user)
            else:
-                raise FBchatException(
+                rtn.append(None)
-                    "Unknown thread type: {}, with data: {}".format(_type, node)
+                log.warning("Unknown thread type: %s, data: %s", _type, node)
                )
        return rtn
    def fetch_threads(
        self, limit: Optional[int], location: ThreadLocation = ThreadLocation.INBOX,
    ) -> Iterable[_thread.ThreadABC]:
        """Lazily fetch the client's thread list, one batch at a time.

        Batches are requested through `_fetch_threads`. After each batch, the
        ``last_active`` value of the last newly yielded thread is used as the
        ``before`` marker for the next request.

        Args:
            limit: Max. number of threads to request. If ``None``, batches are
                requested until one comes back smaller than the batch cap.
            location: INBOX, PENDING, ARCHIVED or OTHER

        Yields:
            Every known thread that has not been yielded before.

        Raises:
            ValueError: If a full batch contained no new, known thread, so there
                is no marker to continue from.
        """
        # This is measured empirically as 837, safe default chosen below
        batch_cap = 100

        # TODO: Clean this up after implementing support for more threads types
        yielded_ids = set()
        marker = None
        for batch_size in _util.get_limits(limit, batch_cap):
            batch = self._fetch_threads(batch_size, marker, [location.value])

            marker = None
            for thread in batch:
                # Skip unknown (falsy) threads and ones we already returned
                if not thread or thread.id in yielded_ids:
                    continue
                yielded_ids.add(thread.id)
                # TODO: Ensure type-wise that .last_active is available
                marker = thread.last_active
                yield thread

            if len(batch) < batch_cap:
                return  # No more data to fetch

            # Checked after the loop, since the batch may have held nothing usable
            if not marker:
                raise ValueError("Too many unknown threads.")
    def fetch_unread(self):
        """Fetch unread threads.
--- a/fbchat/_file.py
+++ b/fbchat/_file.py
@@ -91,8 +91,6 @@ class ImageAttachment(Attachment):
    @classmethod
    def _from_list(cls, data):
        data = data["node"]
        previews = {
            Image._from_uri_or_none(data["image"]),
            Image._from_uri(data["image1"]),
@@ -156,7 +154,6 @@ class VideoAttachment(Attachment):
    @classmethod
    def _from_list(cls, data):
        data = data["node"]
        previews = {
            Image._from_uri(data["image"]),
            Image._from_uri(data["image1"]),
--- a/fbchat/_message.py
+++ b/fbchat/_message.py
@@ -170,6 +170,34 @@ class Message:
        return result, mentions
@attrs_default
class MessageSnippet(Message):
    """Represents data in a Facebook message snippet.

    Inherits `Message`.
    """

    #: ID of the sender
    author = attr.ib()
    #: Datetime of when the message was sent
    created_at = attr.ib()
    #: The actual message
    text = attr.ib()
    #: A dict with offsets, mapped to the matched text
    matched_keywords = attr.ib()

    @classmethod
    def _parse(cls, thread, data):
        """Build a snippet from a single search-result entry.

        Args:
            thread: Thread object the snippet is attached to
            data: Mapping with the keys ``message_id``, ``author``,
                ``timestamp``, ``body`` and ``matched_keywords``

        Returns:
            A new instance of this class.
        """
        author = data["author"]
        # ``str.rstrip("fbid:")`` removes trailing *characters* from that set, so
        # it never removed the prefix. Drop the literal prefix explicitly.
        # NOTE(review): assumes authors look like "fbid:<id>" - confirm.
        prefix = "fbid:"
        if author.startswith(prefix):
            author = author[len(prefix) :]

        return cls(
            thread=thread,
            id=data["message_id"],
            author=author,
            created_at=_util.millis_to_datetime(data["timestamp"]),
            text=data["body"],
            # Offsets arrive as string keys; expose them as integers
            matched_keywords={int(k): v for k, v in data["matched_keywords"].items()},
        )
@attrs_default
 class MessageData(Message):
    """Represents data in a Facebook message.
--- a/fbchat/_thread.py
+++ b/fbchat/_thread.py
@@ -250,20 +250,9 @@ class ThreadABC(metaclass=abc.ABCMeta):
    #         )
    #         return self.send(Message(text=payload, quick_replies=[new]))
-    def search_messages(
+    def _search_messages(self, query, offset, limit):
-        self, query: str, offset: int = 0, limit: int = 5
+        from . import _message
    ) -> Iterable[str]:
        """Find and get message IDs by query.
        Args:
            query: Text to search for
            offset (int): Number of messages to skip
            limit (int): Max. number of messages to retrieve
        Returns:
            typing.Iterable: Found Message IDs
        """
        # TODO: Return proper searchable iterator
        data = {
            "query": query,
            "snippetOffset": offset,
@@ -273,33 +262,54 @@ class ThreadABC(metaclass=abc.ABCMeta):
        }
        j = self.session._payload_post("/ajax/mercury/search_snippets.php?dpr=1", data)
-        result = j["search_snippets"][query]
+        result = j["search_snippets"][query].get(self.id)
-        snippets = result[self.id]["snippets"] if result.get(self.id) else []
+        if not result:
-        for snippet in snippets:
+            return (0, [])
            yield snippet["message_id"]
-    def fetch_messages(self, limit: int = 20, before: datetime.datetime = None):
+        # TODO: May or may not be a good idea to attach the current thread?
-        """Fetch messages in a thread, ordered by most recent.
+        # For now, we just create a new thread:
        thread = self.__class__(session=self.session, id=self.id)
        snippets = [
            _message.MessageSnippet._parse(thread, snippet)
            for snippet in result["snippets"]
        ]
        return (result["num_total_snippets"], snippets)
    def search_messages(self, query: str, limit: int) -> Iterable["MessageSnippet"]:
        """Find and get message IDs by query.
        Warning! If someone send a message to the thread that matches the query, while
        we're searching, some snippets will get returned twice.
        Not sure if we should handle it, Facebook's implementation doesn't...
        Args:
-            limit: Max. number of messages to retrieve
+            query: Text to search for
-            before: The point from which to retrieve messages
+            limit: Max. number of message snippets to retrieve
        Returns:
            list: `Message` objects
        """
        offset = 0
        # The max limit is measured empirically to 420, safe default chosen below
        for limit in _util.get_limits(limit, max_limit=50):
            _, snippets = self._search_messages(query, offset, limit)
            yield from snippets
            if len(snippets) < limit:
                return  # No more data to fetch
            offset += limit
    def _fetch_messages(self, limit, before):
        from . import _message
        # TODO: Return proper searchable iterator
        params = {
            "id": self.id,
            "message_limit": limit,
            "load_messages": True,
            "load_read_receipts": True,
            # "load_delivery_receipts": False,
            # "is_work_teamwork_not_putting_muted_in_unreads": False,
            "before": _util.datetime_to_millis(before) if before else None,
        }
        (j,) = self.session._graphql_requests(
-            _graphql.from_doc_id("1860982147341344", params)
+            _graphql.from_doc_id("1860982147341344", params)  # 2696825200377124
        )
        if j.get("message_thread") is None:
@@ -307,48 +317,86 @@ class ThreadABC(metaclass=abc.ABCMeta):
                "Could not fetch thread {}: {}".format(self.id, j)
            )
        # TODO: Should we parse the returned thread data, too?
        read_receipts = j["message_thread"]["read_receipts"]["nodes"]
        # TODO: May or may not be a good idea to attach the current thread?
        # For now, we just create a new thread:
        thread = self.__class__(session=self.session, id=self.id)
-        messages = [
+        return [
            _message.MessageData._from_graphql(thread, message, read_receipts)
            for message in j["message_thread"]["messages"]["nodes"]
        ]
        messages.reverse()
-        return messages
+    def fetch_messages(self, limit: Optional[int]) -> Iterable["_message.Message"]:
        """Fetch messages in a thread, with most recent messages first.
-    def fetch_images(self):
+        Args:
-        """Fetch images/videos posted in the thread."""
+            limit: Max. number of messages to retrieve. If ``None``, all messages will be
-        # TODO: Return proper searchable iterator
+                retrieved.
-        data = {"id": self.id, "first": 48}
+        """
        # This is measured empirically as 210 in extreme cases, fairly safe default
        # chosen below
        MAX_BATCH_LIMIT = 100
        before = None
        for limit in _util.get_limits(limit, MAX_BATCH_LIMIT):
            messages = self._fetch_messages(limit, before)
            if before:
                # Strip the first message
                yield from messages[1:]
            else:
                yield from messages
            if len(messages) < MAX_BATCH_LIMIT:
                return  # No more data to fetch
            before = messages[-1].created_at
    def _fetch_images(self, limit, after):
        data = {"id": self.id, "first": limit, "after": after}
        (j,) = self.session._graphql_requests(
            _graphql.from_query_id("515216185516880", data)
        )
        while True:
            try:
                i = j[self.id]["message_shared_media"]["edges"][0]
            except IndexError:
                if j[self.id]["message_shared_media"]["page_info"].get("has_next_page"):
                    data["after"] = j[self.id]["message_shared_media"]["page_info"].get(
                        "end_cursor"
                    )
                    (j,) = self.session._graphql_requests(
                        _graphql.from_query_id("515216185516880", data)
                    )
                    continue
                else:
                    break
-            if i["node"].get("__typename") == "MessageImage":
+        result = j[self.id]["message_shared_media"]
-                yield _file.ImageAttachment._from_list(i)
+
-            elif i["node"].get("__typename") == "MessageVideo":
+        print(len(result["edges"]))
-                yield _file.VideoAttachment._from_list(i)
+
        rtn = []
        for edge in result["edges"]:
            node = edge["node"]
            type_ = node["__typename"]
            if type_ == "MessageImage":
                rtn.append(_file.ImageAttachment._from_list(node))
            elif type_ == "MessageVideo":
                rtn.append(_file.VideoAttachment._from_list(node))
            else:
-                yield _attachment.Attachment(id=i["node"].get("legacy_attachment_id"))
+                log.warning("Unknown image type %s, data: %s", type_, edge)
-            del j[self.id]["message_shared_media"]["edges"][0]
+                rtn.append(None)
        # result["page_info"]["has_next_page"] is not correct when limit > 12
        return (result["page_info"]["end_cursor"], rtn)
    def fetch_images(self, limit: Optional[int]) -> Iterable[_attachment.Attachment]:
        """Fetch images/videos posted in the thread.

        Batches are requested through `_fetch_images`, and the cursor each call
        returns is passed to the next one. Iteration stops at the first empty
        batch.

        Args:
            limit: Max. number of images to request. If ``None``, images are
                requested until a batch comes back empty.

        Yields:
            Every truthy entry of each batch; ``None`` placeholders are skipped.
        """
        cursor = None
        # The max limit on this request is unknown, so we set it reasonably high
        # This way `limit=None` also still works
        for batch_size in _util.get_limits(limit, max_limit=1000):
            cursor, images = self._fetch_images(batch_size, cursor)
            if not images:
                return  # No more data to fetch
            for image in images:
                if image:
                    yield image
    def set_nickname(self, user_id: str, nickname: str):
        """Change the nickname of a user in the thread.
--- a/fbchat/_util.py
+++ b/fbchat/_util.py
@@ -13,6 +13,8 @@ from ._exception import (
    FBchatPleaseRefresh,
 )
 from typing import Iterable, Optional
 #: Default list of user agents
 USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36",
@@ -24,6 +26,24 @@ USER_AGENTS = [
 ]
 def get_limits(limit: Optional[int], max_limit: int) -> Iterable[int]:
    """Split a total ``limit`` into per-request batch sizes.

    Args:
        limit: Total number of items wanted. If ``None``, ``max_limit`` is
            yielded forever and the caller has to stop iterating.
        max_limit: Largest batch size to yield

    Yields:
        ``max_limit`` once per full batch, then the remainder if it is non-zero.

    Raises:
        ValueError: If ``limit`` is negative (raised on first iteration, since
            this is a generator).
    """
    if limit is None:
        # Unbounded request: keep handing out full batches
        while True:
            yield max_limit

    if limit < 0:
        raise ValueError("Limit cannot be negative")

    full_batches, leftover = divmod(limit, max_limit)
    for _ in range(full_batches):
        yield max_limit
    if leftover:
        yield leftover
 def now():
    return int(time.time() * 1000)
--- a/tests/test_file.py
+++ b/tests/test_file.py
@@ -46,7 +46,7 @@ def test_imageattachment_from_list():
                height=988,
            ),
        },
-    ) == ImageAttachment._from_list({"node": data})
+    ) == ImageAttachment._from_list(data)
 def test_videoattachment_from_list():
@@ -88,7 +88,7 @@ def test_videoattachment_from_list():
                height=368,
            ),
        },
-    ) == VideoAttachment._from_list({"node": data})
+    ) == VideoAttachment._from_list(data)
 def test_graphql_to_attachment_empty():