CLUSTER BY Clause

    1. CREATE TABLE person (name STRING, age INT);
    2. INSERT INTO person VALUES
    3. ('Zen Hui', 25),
    4. ('Anil B', 18),
    5. ('Shone S', 16),
    6. ('Mike A', 25),
    7. ('John A', 18),
    8. ('Jack N', 16);
    9. -- Reduce the number of shuffle partitions to 2 to illustrate the behavior of `CLUSTER BY`.
    10. -- It's easier to see the clustering and sorting behavior with less number of partitions.
    11. SET spark.sql.shuffle.partitions = 2;
    12. -- Select the rows with no ordering. Please note that without any sort directive, the results
    13. -- of a query when `CLUSTER BY` is not used vs when it's used. The query below produces rows
    14. -- where age column is not sorted.
    15. SELECT age, name FROM person;
    16. +---+-------+
    17. |age| name|
    18. +---+-------+
    19. | 16|Shone S|
    20. | 25|Zen Hui|
    21. | 16| Jack N|
    22. | 25| Mike A|
    23. | 18| John A|
    24. | 18| Anil B|
    25. +---+-------+
    26. -- In the query below, persons with age 18 and 25 are in first partition and the
    27. -- persons with age 16 are in the second partition. The rows are sorted based
    28. -- on age within each partition.
    29. SELECT age, name FROM person CLUSTER BY age;
    30. +---+-------+
    31. +---+-------+
    32. | 18| John A|
    33. | 18| Anil B|
    34. | 25|Zen Hui|
    35. | 25| Mike A|
    36. | 16|Shone S|
    37. | 16| Jack N|
    38. +---+-------+