diff --git a/docs/tff_for_research.md b/docs/tff_for_research.md index b515aed2d9a70c960d661fcf4003ca773e5de1b5..6beaf0c9e28cd67396ef19371ad289592cb0a4e9 100644 --- a/docs/tff_for_research.md +++ b/docs/tff_for_research.md @@ -41,7 +41,13 @@ types of logic. TensorFlow federated [hosts multiple datasets](https://www.tensorflow.org/federated/api_docs/python/tff/simulation/datasets) that are representative of the characteristics of real-world problems that could -be solved with federated learning. Datasets include: +be solved with federated learning. + +Note: These datasets can also be consumed by any Python-based ML framework as +NumPy arrays, as documented in the +[ClientData API](https://www.tensorflow.org/federated/api_docs/python/tff/simulation/ClientData). + +Datasets include: * [**StackOverflow**.](https://www.tensorflow.org/federated/api_docs/python/tff/simulation/datasets/stackoverflow/load_data) A realistic text dataset for language modeling or supervised learning tasks, diff --git a/tensorflow_federated/python/simulation/client_data.py b/tensorflow_federated/python/simulation/client_data.py index 0586f0853cc9b624d25c686f1dcff9919168ec8f..66dd89dd3324641bb6d6fc0a536e5ae2daa074dc 100644 --- a/tensorflow_federated/python/simulation/client_data.py +++ b/tensorflow_federated/python/simulation/client_data.py @@ -11,14 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Library methods for working with centralized data used in simulation. - -N.B. Federated Learning does not use client IDs or perform any tracking of -clients. However in simulation experiments using centralized test data the -experimenter may select specific clients to be processed per round. The concept -of a client ID is only available at the preprocessing stage when preparing input -data for the simulation and is not part of the TensorFlow Federated core APIs. 
-""" +"""Library methods for working with centralized data used in simulation.""" import abc import collections @@ -33,7 +26,35 @@ from tensorflow_federated.python.tensorflow_libs import version_check class ClientData(object, metaclass=abc.ABCMeta): - """Object to hold a dataset and a mapping of clients to examples.""" + """Object to hold a federated dataset. + + The federated dataset is represented as a list of client IDs, and + a function to look up the local dataset for each client ID. + + Note: Cross-device federated learning does not use client IDs or perform any + tracking of clients. However, in simulation experiments using centralized test + data the experimenter may select specific clients to be processed per round. + The concept of a client ID is only available at the preprocessing stage when + preparing input data for the simulation and is not part of the TensorFlow + Federated core APIs. + + Each client's local dataset is represented as a `tf.data.Dataset`, but + generally this class (and the corresponding datasets hosted by TFF) can + easily be consumed by any Python-based ML framework as `numpy` arrays: + + ```python + import tensorflow as tf + import tensorflow_federated as tff + import tensorflow_datasets as tfds + + for client_id in sampled_client_ids[:5]: + client_local_dataset = tfds.as_numpy( + emnist_train.create_tf_dataset_for_client(client_id)) + # client_local_dataset is an iterable of structures of numpy arrays + for example in client_local_dataset: + print(example) + ``` + """ @abc.abstractproperty def client_ids(self) -> List[str]: