diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 92737111fb8a882fadb31967d44da31cd5e12496..7228afcbc6f6fddf54db5ffabf36759d6ce95c09 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,51 @@
+2017-02-09  Nathan Sidwell  <nathan@codesourcery.com>
+	    Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* gimplify.c (gimplify_scan_omp_clauses): No special handling for
+	OMP_CLAUSE_TILE.
+	(gimplify_adjust_omp_clauses): Don't delete TILE.
+	(gimplify_omp_for): Deal with TILE.
+	* internal-fn.c (expand_GOACC_TILE): New function.
+	* internal-fn.def (GOACC_DIM_POS): Comment may be overly conservative.
+	(GOACC_TILE): New.
+	* omp-expand.c (struct oacc_collapse): Add tile and outer fields.
+	(expand_oacc_collapse_init): Add LOC paramter.  Initialize tile
+	element fields.
+	(expand_oacc_collapse_vars): Add INNER parm, adjust for tiling,
+	avoid DIV for outermost collapse var.
+	(expand_oacc_for): Insert tile element loop as needed.  Adjust.
+	Remove out of date comments, fix whitespace.
+	* omp-general.c (omp_extract_for_data): Deal with tiling.
+	* omp-general.h (enum oacc_loop_flags): Add OLF_TILE flag,
+	adjust OLF_DIM_BASE value.
+	(struct omp_for_data): Add tiling field.
+	* omp-low.c (scan_sharing_clauses): Allow OMP_CLAUSE_TILE.
+	(lower_oacc_head_mark): Add OLF_TILE as appropriate.  Ensure 2 levels
+	for auto loops.  Remove default auto determining, moved to
+	oacc_loop_fixed_partitions.
+	* omp-offload.c (struct oacc_loop): Change 'ifns' to vector of call
+	stmts, add e_mask field.
+	(oacc_dim_call): New function, abstracted out from oacc_thread_numbers.
+	(oacc_thread_numbers): Use oacc_dim_call.
+	(oacc_xform_tile): New.
+	(new_oacc_loop_raw): Initialize e_mask, adjust for ifns vector.
+	(finish_oacc_loop): Adjust for ifns vector.
+	(oacc_loop_discover_walk): Append loop abstraction sites to list,
+	add case for GOACC_TILE fns.
+	(oacc_loop_xform_loop): Delete.
+	(oacc_loop_process): Iterate over call list directly, and add
+	handling for GOACC_TILE fns.
+	(oacc_loop_fixed_partitions): Determine default auto, deal with TILE,
+	dump partitioning.
+	(oacc_loop_auto_partitions): Add outer_assign parm. Assign all but
+	vector partitioning to outer loops.  Assign 2 partitions to loops
+	when available. Add TILE handling.
+	(oacc_loop_partition): Adjust oacc_loop_auto_partitions call.
+	(execite_oacc_device_lower): Process GOACC_TILE fns, ignore unknown specs.
+	* tree-nested.c (convert_nonlocal_omp_clauses): Allow OMP_CLAUSE_TILE.
+	* tree.c (omp_clause_num_ops): Adjust TILE ops.
+	* tree.h (OMP_CLAUSE_TILE_ITERVAR, OMP_CLAUSE_TILE_COUNT): New.
+
 2017-02-09  Gerald Pfeifer  <gerald@pfeifer.com>
 
 	* configure.ac (ACX_BUGURL): Update.
diff --git a/gcc/c/ChangeLog b/gcc/c/ChangeLog
index 1ab194a631077fe289ff51b8c50a3951c7201533..c05721df7eccdeb0576c2437d95afbe87871a149 100644
--- a/gcc/c/ChangeLog
+++ b/gcc/c/ChangeLog
@@ -1,3 +1,11 @@
+2016-02-09  Nathan Sidwell  <nathan@codesourcery.com>
+	    Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* c-parser.c (c_parser_omp_clause_collapse): Disallow tile.
+	(c_parser_oacc_clause_tile): Disallow collapse. Fix parsing and
+	semantic checking.
+	* c-parser.c (c_parser_omp_for_loop): Accept tiling constructs.
+
 2017-02-07  Richard Biener  <rguenther@suse.de>
 
 	* gimple-parser.c (c_parser_gimple_expr_list): Simplify.
diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 5c152ab28b0c623b1b4cf2f21b2fb61e64f3db87..6e83728b577a0b4759b1bc77134a3c45391c0b93 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -11023,6 +11023,7 @@ c_parser_omp_clause_collapse (c_parser *parser, tree list)
   location_t loc;
 
   check_no_duplicate_clause (list, OMP_CLAUSE_COLLAPSE, "collapse");
+  check_no_duplicate_clause (list, OMP_CLAUSE_TILE, "tile");
 
   loc = c_parser_peek_token (parser)->location;
   if (c_parser_require (parser, CPP_OPEN_PAREN, "expected %<(%>"))
@@ -11933,10 +11934,11 @@ static tree
 c_parser_oacc_clause_tile (c_parser *parser, tree list)
 {
   tree c, expr = error_mark_node;
-  location_t loc, expr_loc;
+  location_t loc;
   tree tile = NULL_TREE;
 
   check_no_duplicate_clause (list, OMP_CLAUSE_TILE, "tile");
+  check_no_duplicate_clause (list, OMP_CLAUSE_COLLAPSE, "collapse");
 
   loc = c_parser_peek_token (parser)->location;
   if (!c_parser_require (parser, CPP_OPEN_PAREN, "expected %<(%>"))
@@ -11944,16 +11946,19 @@ c_parser_oacc_clause_tile (c_parser *parser, tree list)
 
   do
     {
+      if (tile && !c_parser_require (parser, CPP_COMMA, "expected %<,%>"))
+	return list;
+
       if (c_parser_next_token_is (parser, CPP_MULT)
 	  && (c_parser_peek_2nd_token (parser)->type == CPP_COMMA
 	      || c_parser_peek_2nd_token (parser)->type == CPP_CLOSE_PAREN))
 	{
 	  c_parser_consume_token (parser);
-	  expr = integer_minus_one_node;
+	  expr = integer_zero_node;
 	}
       else
 	{
-	  expr_loc = c_parser_peek_token (parser)->location;
+	  location_t expr_loc = c_parser_peek_token (parser)->location;
 	  c_expr cexpr = c_parser_expr_no_commas (parser, NULL);
 	  cexpr = convert_lvalue_to_rvalue (expr_loc, cexpr, false, true);
 	  expr = cexpr.value;
@@ -11965,28 +11970,19 @@ c_parser_oacc_clause_tile (c_parser *parser, tree list)
 	      return list;
 	    }
 
-	  if (!INTEGRAL_TYPE_P (TREE_TYPE (expr)))
-	    {
-	      c_parser_error (parser, "%<tile%> value must be integral");
-	      return list;
-	    }
-
 	  expr = c_fully_fold (expr, false, NULL);
 
-	  /* Attempt to statically determine when expr isn't positive.  */
-	  c = fold_build2_loc (expr_loc, LE_EXPR, boolean_type_node, expr,
-			       build_int_cst (TREE_TYPE (expr), 0));
-	  protected_set_expr_location (c, expr_loc);
-	  if (c == boolean_true_node)
+	  if (!INTEGRAL_TYPE_P (TREE_TYPE (expr))
+	      || !tree_fits_shwi_p (expr)
+	      || tree_to_shwi (expr) <= 0)
 	    {
-	      warning_at (expr_loc, 0,"%<tile%> value must be positive");
-	      expr = integer_one_node;
+	      error_at (expr_loc, "%<tile%> argument needs positive"
+			" integral constant");
+	      expr = integer_zero_node;
 	    }
 	}
 
       tile = tree_cons (NULL_TREE, expr, tile);
-      if (c_parser_next_token_is (parser, CPP_COMMA))
-	c_parser_consume_token (parser);
     }
   while (c_parser_next_token_is_not (parser, CPP_CLOSE_PAREN));
 
@@ -14910,11 +14906,17 @@ c_parser_omp_for_loop (location_t loc, c_parser *parser, enum tree_code code,
   bool fail = false, open_brace_parsed = false;
   int i, collapse = 1, ordered = 0, count, nbraces = 0;
   location_t for_loc;
+  bool tiling = false;
   vec<tree, va_gc> *for_block = make_tree_vector ();
 
   for (cl = clauses; cl; cl = OMP_CLAUSE_CHAIN (cl))
     if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_COLLAPSE)
       collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (cl));
+    else if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_TILE)
+      {
+	tiling = true;
+	collapse = list_length (OMP_CLAUSE_TILE_LIST (cl));
+      }
     else if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_ORDERED
 	     && OMP_CLAUSE_ORDERED_EXPR (cl))
       {
@@ -14944,7 +14946,7 @@ c_parser_omp_for_loop (location_t loc, c_parser *parser, enum tree_code code,
 	  pc = &OMP_CLAUSE_CHAIN (*pc);
     }
 
-  gcc_assert (collapse >= 1 && ordered >= 0);
+  gcc_assert (tiling || (collapse >= 1 && ordered >= 0));
   count = ordered ? ordered : collapse;
 
   declv = make_tree_vec (count);
diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog
index dc87561793c9369af6291965b9834ec60993a742..291dcddcde17af81633b14040630d01a96e8d2bc 100644
--- a/gcc/cp/ChangeLog
+++ b/gcc/cp/ChangeLog
@@ -1,3 +1,16 @@
+2016-02-09  Nathan Sidwell  <nathan@codesourcery.com>
+	    Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* parser.c (cp_parser_oacc_clause_tile): Disallow collapse.  Fix
+	parsing.  Parse constant expression. Remove semantic checking.
+	(cp_parser_omp_clause_collapse): Disallow tile.
+	(cp_parser_omp_for_loop): Deal with tile clause.  Don't emit a parse
+	error about missing for after already emitting one.  Use more
+	conventional for idiom for unbounded loop.
+	* pt.c (tsubst_omp_clauses): Handle OMP_CLAUSE_TILE.
+	* semantics.c (finish_omp_clauses): Correct TILE semantic check.
+	(finish_omp_for): Deal with tile clause.
+
 2017-02-07  Nathan Sidwell  <nathan@acm.org>
 
 	* method.c (synthesized_method_base_walk): New.  Broken out of ...
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index d2df777f344d8d7987437c7b91cc351f1a7d9b04..41b08e1a7a30eedff870b7b5972f8dde23e51fae 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -31274,30 +31274,33 @@ cp_parser_oacc_clause_tile (cp_parser *parser, location_t clause_loc, tree list)
   tree c, expr = error_mark_node;
   tree tile = NULL_TREE;
 
+  /* Collapse and tile are mutually exclusive.  (The spec doesn't say
+     so, but the spec authors never considered such a case and have
+     differing opinions on what it might mean, including 'not
+     allowed'.)  */
   check_no_duplicate_clause (list, OMP_CLAUSE_TILE, "tile", clause_loc);
+  check_no_duplicate_clause (list, OMP_CLAUSE_COLLAPSE, "collapse",
+			     clause_loc);
 
   if (!cp_parser_require (parser, CPP_OPEN_PAREN, RT_OPEN_PAREN))
     return list;
 
   do
     {
+      if (tile && !cp_parser_require (parser, CPP_COMMA, RT_COMMA))
+	return list;
+      
       if (cp_lexer_next_token_is (parser->lexer, CPP_MULT)
 	  && (cp_lexer_nth_token_is (parser->lexer, 2, CPP_COMMA)
 	      || cp_lexer_nth_token_is (parser->lexer, 2, CPP_CLOSE_PAREN)))
 	{
 	  cp_lexer_consume_token (parser->lexer);
-	  expr = integer_minus_one_node;
+	  expr = integer_zero_node;
 	}
       else
-	expr = cp_parser_assignment_expression (parser, NULL, false, false);
-
-      if (expr == error_mark_node)
-	return list;
+	expr = cp_parser_constant_expression (parser);
 
       tile = tree_cons (NULL_TREE, expr, tile);
-
-      if (cp_lexer_next_token_is (parser->lexer, CPP_COMMA))
-	cp_lexer_consume_token (parser->lexer);
     }
   while (cp_lexer_next_token_is_not (parser->lexer, CPP_CLOSE_PAREN));
 
@@ -31410,6 +31413,7 @@ cp_parser_omp_clause_collapse (cp_parser *parser, tree list, location_t location
     }
 
   check_no_duplicate_clause (list, OMP_CLAUSE_COLLAPSE, "collapse", location);
+  check_no_duplicate_clause (list, OMP_CLAUSE_TILE, "tile", location);
   c = build_omp_clause (loc, OMP_CLAUSE_COLLAPSE);
   OMP_CLAUSE_CHAIN (c) = list;
   OMP_CLAUSE_COLLAPSE_EXPR (c) = num;
@@ -34416,10 +34420,16 @@ cp_parser_omp_for_loop (cp_parser *parser, enum tree_code code, tree clauses,
   int i, collapse = 1, ordered = 0, count, nbraces = 0;
   vec<tree, va_gc> *for_block = make_tree_vector ();
   auto_vec<tree, 4> orig_inits;
+  bool tiling = false;
 
   for (cl = clauses; cl; cl = OMP_CLAUSE_CHAIN (cl))
     if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_COLLAPSE)
       collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (cl));
+    else if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_TILE)
+      {
+	tiling = true;
+	collapse = list_length (OMP_CLAUSE_TILE_LIST (cl));
+      }
     else if (OMP_CLAUSE_CODE (cl) == OMP_CLAUSE_ORDERED
 	     && OMP_CLAUSE_ORDERED_EXPR (cl))
       {
@@ -34449,7 +34459,7 @@ cp_parser_omp_for_loop (cp_parser *parser, enum tree_code code, tree clauses,
 	  pc = &OMP_CLAUSE_CHAIN (*pc);
     }
 
-  gcc_assert (collapse >= 1 && ordered >= 0);
+  gcc_assert (tiling || (collapse >= 1 && ordered >= 0));
   count = ordered ? ordered : collapse;
 
   declv = make_tree_vec (count);
@@ -34468,13 +34478,15 @@ cp_parser_omp_for_loop (cp_parser *parser, enum tree_code code, tree clauses,
       if (code != CILK_FOR
 	  && !cp_lexer_next_token_is_keyword (parser->lexer, RID_FOR))
 	{
-	  cp_parser_error (parser, "for statement expected");
+	  if (!collapse_err)
+	    cp_parser_error (parser, "for statement expected");
 	  return NULL;
 	}
       if (code == CILK_FOR
 	  && !cp_lexer_next_token_is_keyword (parser->lexer, RID_CILK_FOR))
 	{
-	  cp_parser_error (parser, "_Cilk_for statement expected");
+	  if (!collapse_err)
+	    cp_parser_error (parser, "_Cilk_for statement expected");
 	  return NULL;
 	}
       loc = cp_lexer_consume_token (parser->lexer)->location;
@@ -34634,7 +34646,7 @@ cp_parser_omp_for_loop (cp_parser *parser, enum tree_code code, tree clauses,
 	 nested.  Hopefully the final version clarifies this.
 	 For now handle (multiple) {'s and empty statements.  */
       cp_parser_parse_tentatively (parser);
-      do
+      for (;;)
 	{
 	  if (cp_lexer_next_token_is_keyword (parser->lexer, RID_FOR))
 	    break;
@@ -34649,14 +34661,13 @@ cp_parser_omp_for_loop (cp_parser *parser, enum tree_code code, tree clauses,
 	  else
 	    {
 	      loc = cp_lexer_peek_token (parser->lexer)->location;
-	      error_at (loc, "not enough collapsed for loops");
+	      error_at (loc, "not enough for loops to collapse");
 	      collapse_err = true;
 	      cp_parser_abort_tentative_parse (parser);
 	      declv = NULL_TREE;
 	      break;
 	    }
 	}
-      while (1);
 
       if (declv)
 	{
diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
index 6072432382d51afa304f7a791e57e5cd2098f761..8863c281ad7cac5d57fff8da3f02b64c2057bee9 100644
--- a/gcc/cp/pt.c
+++ b/gcc/cp/pt.c
@@ -15078,6 +15078,7 @@ tsubst_omp_clauses (tree clauses, enum c_omp_region_type ort,
 	    = tsubst_omp_clause_decl (OMP_CLAUSE_DECL (oc), args, complain,
 				      in_decl);
 	  break;
+	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE_IF:
 	case OMP_CLAUSE_NUM_THREADS:
 	case OMP_CLAUSE_SCHEDULE:
@@ -15172,19 +15173,6 @@ tsubst_omp_clauses (tree clauses, enum c_omp_region_type ort,
 	case OMP_CLAUSE_AUTO:
 	case OMP_CLAUSE_SEQ:
 	  break;
-	case OMP_CLAUSE_TILE:
-	  {
-	    tree lnc, loc;
-	    for (lnc = OMP_CLAUSE_TILE_LIST (nc),
-		   loc = OMP_CLAUSE_TILE_LIST (oc);
-		 loc;
-		 loc = TREE_CHAIN (loc), lnc = TREE_CHAIN (lnc))
-	      {
-		TREE_VALUE (lnc) = tsubst_expr (TREE_VALUE (loc), args,
-						complain, in_decl, false);
-	      }
-	  }
-	  break;
 	default:
 	  gcc_unreachable ();
 	}
diff --git a/gcc/cp/semantics.c b/gcc/cp/semantics.c
index e4f2a6a8807931a693654c18946dc2be7623be41..e9fc4aa27977b61492e773d38ddee90e5cfb0b12 100644
--- a/gcc/cp/semantics.c
+++ b/gcc/cp/semantics.c
@@ -7099,7 +7099,8 @@ finish_omp_clauses (tree clauses, enum c_omp_region_type ort)
 	      else if (!type_dependent_expression_p (t)
 		       && !INTEGRAL_TYPE_P (TREE_TYPE (t)))
 		{
-		  error ("%<tile%> value must be integral");
+		  error_at (OMP_CLAUSE_LOCATION (c),
+			    "%<tile%> argument needs integral type");
 		  remove = true;
 		}
 	      else
@@ -7107,14 +7108,16 @@ finish_omp_clauses (tree clauses, enum c_omp_region_type ort)
 		  t = mark_rvalue_use (t);
 		  if (!processing_template_decl)
 		    {
+		      /* Zero is used to indicate '*', we permit you
+			 to get there via an ICE of value zero.  */
 		      t = maybe_constant_value (t);
-		      if (TREE_CODE (t) == INTEGER_CST
-			  && tree_int_cst_sgn (t) != 1
-			  && t != integer_minus_one_node)
+		      if (!tree_fits_shwi_p (t)
+			  || tree_to_shwi (t) < 0)
 			{
-			  warning_at (OMP_CLAUSE_LOCATION (c), 0,
-				      "%<tile%> value must be positive");
-			  t = integer_one_node;
+			  error_at (OMP_CLAUSE_LOCATION (c),
+				    "%<tile%> argument needs positive "
+				    "integral constant");
+			  remove = true;
 			}
 		    }
 		  t = fold_build_cleanup_point_expr (TREE_TYPE (t), t);
@@ -8013,11 +8016,19 @@ finish_omp_for (location_t locus, enum tree_code code, tree declv,
   gcc_assert (TREE_VEC_LENGTH (declv) == TREE_VEC_LENGTH (incrv));
   if (TREE_VEC_LENGTH (declv) > 1)
     {
-      tree c = omp_find_clause (clauses, OMP_CLAUSE_COLLAPSE);
+      tree c;
+
+      c = omp_find_clause (clauses, OMP_CLAUSE_TILE);
       if (c)
-	collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (c));
-      if (collapse != TREE_VEC_LENGTH (declv))
-	ordered = TREE_VEC_LENGTH (declv);
+	collapse = list_length (OMP_CLAUSE_TILE_LIST (c));
+      else
+	{
+	  c = omp_find_clause (clauses, OMP_CLAUSE_COLLAPSE);
+	  if (c)
+	    collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (c));
+	  if (collapse != TREE_VEC_LENGTH (declv))
+	    ordered = TREE_VEC_LENGTH (declv);
+	}
     }
   for (i = 0; i < TREE_VEC_LENGTH (declv); i++)
     {
diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog
index 0b3279667dd11e8384ca2bdccea00e1c5ee860de..3488f01e7cdaa54bebbe8e342ddd1e2410f0eda1 100644
--- a/gcc/fortran/ChangeLog
+++ b/gcc/fortran/ChangeLog
@@ -1,3 +1,12 @@
+2017-02-09  Cesar Philippidis  <cesar@codesourcery.com>
+	    Joseph Myers  <joseph@codesourcery.com>
+
+	* openmp.c (resolve_omp_clauses): Error on directives
+	containing both tile and collapse clauses.
+	(resolve_oacc_loop_blocks): Represent '*' tile arguments as zero.
+	* trans-openmp.c (gfc_trans_omp_do): Lower tiled loops like
+	collapsed loops.
+
 2017-02-07  Steven G. Kargl  <kargl@gcc.gnu.org>
 
 	* trans-types.c	(gfc_get_int_kind_from_width_isofortranen):  Choose
diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c
index d19ee9483300f1a7f5ba1d10e06ef0d011d3bac3..3ca23493251f281c31726219a9b96a7da44078de 100644
--- a/gcc/fortran/openmp.c
+++ b/gcc/fortran/openmp.c
@@ -4754,6 +4754,8 @@ resolve_omp_clauses (gfc_code *code, gfc_omp_clauses *omp_clauses,
     if (omp_clauses->wait_list)
       for (el = omp_clauses->wait_list; el; el = el->next)
 	resolve_scalar_int_expr (el->expr, "WAIT");
+  if (omp_clauses->collapse && omp_clauses->tile_list)
+    gfc_error ("Incompatible use of TILE and COLLAPSE at %L", &code->loc);
   if (omp_clauses->depend_source && code->op != EXEC_OMP_ORDERED)
     gfc_error ("SOURCE dependence type only allowed "
 	       "on ORDERED directive at %L", &code->loc);
@@ -5900,11 +5902,11 @@ resolve_oacc_loop_blocks (gfc_code *code)
 	  if (el->expr == NULL)
 	    {
 	      /* NULL expressions are used to represent '*' arguments.
-		 Convert those to a -1 expressions.  */
+		 Convert those to a 0 expressions.  */
 	      el->expr = gfc_get_constant_expr (BT_INTEGER,
 						gfc_default_integer_kind,
 						&code->loc);
-	      mpz_set_si (el->expr->value.integer, -1);
+	      mpz_set_si (el->expr->value.integer, 0);
 	    }
 	  else
 	    {
diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c
index 4f525feb5023377353f0abafc612675d035a3136..662036f514db105c685deacc1a4da19b88b037d9 100644
--- a/gcc/fortran/trans-openmp.c
+++ b/gcc/fortran/trans-openmp.c
@@ -3488,6 +3488,17 @@ gfc_trans_omp_do (gfc_code *code, gfc_exec_op op, stmtblock_t *pblock,
   dovar_init *di;
   unsigned ix;
   vec<tree, va_heap, vl_embed> *saved_doacross_steps = doacross_steps;
+  gfc_expr_list *tile = do_clauses ? do_clauses->tile_list : clauses->tile_list;
+
+  /* Both collapsed and tiled loops are lowered the same way.  In
+     OpenACC, those clauses are not compatible, so prioritize the tile
+     clause, if present.  */
+  if (tile)
+    {
+      collapse = 0;
+      for (gfc_expr_list *el = tile; el; el = el->next)
+	collapse++;
+    }
 
   doacross_steps = NULL;
   if (clauses->orderedc)
diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index feb5fa0f9e93aa6c5cb47ec7e00b17ddc5949570..dd73fc258e59ea2e04e961f8967460c580768831 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -8340,20 +8340,11 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p,
 	    remove = true;
 	  break;
 
-	case OMP_CLAUSE_TILE:
-	  for (tree list = OMP_CLAUSE_TILE_LIST (c); !remove && list;
-	       list = TREE_CHAIN (list))
-	    {
-	      if (gimplify_expr (&TREE_VALUE (list), pre_p, NULL,
-				 is_gimple_val, fb_rvalue) == GS_ERROR)
-		remove = true;
-	    }
-	  break;
-
 	case OMP_CLAUSE_NOWAIT:
 	case OMP_CLAUSE_ORDERED:
 	case OMP_CLAUSE_UNTIED:
 	case OMP_CLAUSE_COLLAPSE:
+	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE_AUTO:
 	case OMP_CLAUSE_SEQ:
 	case OMP_CLAUSE_INDEPENDENT:
@@ -9122,13 +9113,7 @@ gimplify_adjust_omp_clauses (gimple_seq *pre_p, gimple_seq body, tree *list_p,
 	case OMP_CLAUSE_VECTOR:
 	case OMP_CLAUSE_AUTO:
 	case OMP_CLAUSE_SEQ:
-	  break;
-
 	case OMP_CLAUSE_TILE:
-	  /* We're not yet making use of the information provided by OpenACC
-	     tile clauses.  Discard these here, to simplify later middle end
-	     processing.  */
-	  remove = true;
 	  break;
 
 	default:
@@ -9583,10 +9568,13 @@ gimplify_omp_for (tree *expr_p, gimple_seq *pre_p)
 						 (OMP_FOR_INIT (for_stmt))
 					       * 2);
     }
-  int collapse = 1;
+  int collapse = 1, tile = 0;
   c = omp_find_clause (OMP_FOR_CLAUSES (for_stmt), OMP_CLAUSE_COLLAPSE);
   if (c)
     collapse = tree_to_shwi (OMP_CLAUSE_COLLAPSE_EXPR (c));
+  c = omp_find_clause (OMP_FOR_CLAUSES (for_stmt), OMP_CLAUSE_TILE);
+  if (c)
+    tile = list_length (OMP_CLAUSE_TILE_LIST (c));
   for (i = 0; i < TREE_VEC_LENGTH (OMP_FOR_INIT (for_stmt)); i++)
     {
       t = TREE_VEC_ELT (OMP_FOR_INIT (for_stmt), i);
@@ -10000,7 +9988,7 @@ gimplify_omp_for (tree *expr_p, gimple_seq *pre_p)
 	  OMP_CLAUSE_LINEAR_STEP (c2) = OMP_CLAUSE_LINEAR_STEP (c);
 	}
 
-      if ((var != decl || collapse > 1) && orig_for_stmt == for_stmt)
+      if ((var != decl || collapse > 1 || tile) && orig_for_stmt == for_stmt)
 	{
 	  for (c = OMP_FOR_CLAUSES (for_stmt); c ; c = OMP_CLAUSE_CHAIN (c))
 	    if (((OMP_CLAUSE_CODE (c) == OMP_CLAUSE_LASTPRIVATE
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 0d61375462d030dd33abccd78cdc3d1b73a72b5e..1ccc803631af9f0a9fe39142818e678a3ad7e314 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -2479,6 +2479,14 @@ expand_GOACC_REDUCTION (internal_fn, gcall *)
   gcc_unreachable ();
 }
 
+/* This is expanded by oacc_device_lower pass.  */
+
+static void
+expand_GOACC_TILE (internal_fn, gcall *)
+{
+  gcc_unreachable ();
+}
+
 /* Set errno to EDOM.  */
 
 static void
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index fd25a9522994b16957a9928b8a8569db252d17ca..9f682322c871cc4e6b26246589223ff1bdfd19d8 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -187,7 +187,7 @@ DEF_INTERNAL_FN (PHI, 0, NULL)
    dimension.  DIM_POS is pure (and not const) so that it isn't
    thought to clobber memory and can be gcse'd within a single
    parallel region, but not across FORK/JOIN boundaries.  They take a
-   single INTEGER_CST argument.  */
+   single INTEGER_CST argument.  This might be overly conservative.  */
 DEF_INTERNAL_FN (GOACC_DIM_SIZE, ECF_CONST | ECF_NOTHROW | ECF_LEAF, ".")
 DEF_INTERNAL_FN (GOACC_DIM_POS, ECF_PURE | ECF_NOTHROW | ECF_LEAF, ".")
 
@@ -197,6 +197,10 @@ DEF_INTERNAL_FN (GOACC_LOOP, ECF_PURE | ECF_NOTHROW, NULL)
 /* OpenACC reduction abstraction.  See internal-fn.h  for usage.  */
 DEF_INTERNAL_FN (GOACC_REDUCTION, ECF_NOTHROW | ECF_LEAF, NULL)
 
+/* Openacc tile abstraction. Describes the spans of the element loop.
+   GOACC_TILE (num-loops, loop-no, tile-arg, tile-mask, element-mask).  */
+DEF_INTERNAL_FN (GOACC_TILE, ECF_NOTHROW | ECF_LEAF, NULL)
+
 /* Set errno to EDOM, if GCC knows how to do that directly for the
    current target.  */
 DEF_INTERNAL_FN (SET_EDOM, ECF_LEAF | ECF_NOTHROW, NULL)
diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index 736573611a343b613bd7df8584c12bb82a8328ed..55e54e4dbf81792536e372cdcdee36ac89ab7493 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -1409,7 +1409,9 @@ struct oacc_collapse
 {
   tree base;  /* Base value.  */
   tree iters; /* Number of steps.  */
-  tree step;  /* step size.  */
+  tree step;  /* Step size.  */
+  tree tile;  /* Tile increment (if tiled).  */
+  tree outer; /* Tile iterator var. */
 };
 
 /* Helper for expand_oacc_for.  Determine collapsed loop information.
@@ -1419,15 +1421,20 @@ struct oacc_collapse
 static tree
 expand_oacc_collapse_init (const struct omp_for_data *fd,
 			   gimple_stmt_iterator *gsi,
-			   oacc_collapse *counts, tree bound_type)
+			   oacc_collapse *counts, tree bound_type,
+			   location_t loc)
 {
+  tree tiling = fd->tiling;
   tree total = build_int_cst (bound_type, 1);
   int ix;
 
   gcc_assert (integer_onep (fd->loop.step));
   gcc_assert (integer_zerop (fd->loop.n1));
 
-  for (ix = 0; ix != fd->collapse; ix++)
+  /* When tiling, the first operand of the tile clause applies to the
+     innermost loop, and we work outwards from there.  Seems
+     backwards, but whatever.  */
+  for (ix = fd->collapse; ix--;)
     {
       const omp_for_data_loop *loop = &fd->loops[ix];
 
@@ -1442,6 +1449,30 @@ expand_oacc_collapse_init (const struct omp_for_data *fd,
       if (POINTER_TYPE_P (diff_type) || TYPE_UNSIGNED (diff_type))
 	diff_type = signed_type_for (diff_type);
 
+      if (tiling)
+	{
+	  tree num = build_int_cst (integer_type_node, fd->collapse);
+	  tree loop_no = build_int_cst (integer_type_node, ix);
+	  tree tile = TREE_VALUE (tiling);
+	  gcall *call
+	    = gimple_build_call_internal (IFN_GOACC_TILE, 5, num, loop_no, tile,
+					  /* gwv-outer=*/integer_zero_node,
+					  /* gwv-inner=*/integer_zero_node);
+
+	  counts[ix].outer = create_tmp_var (iter_type, ".outer");
+	  counts[ix].tile = create_tmp_var (diff_type, ".tile");
+	  gimple_call_set_lhs (call, counts[ix].tile);
+	  gimple_set_location (call, loc);
+	  gsi_insert_before (gsi, call, GSI_SAME_STMT);
+
+	  tiling = TREE_CHAIN (tiling);
+	}
+      else
+	{
+	  counts[ix].tile = NULL;
+	  counts[ix].outer = loop->v;
+	}
+
       tree b = loop->n1;
       tree e = loop->n2;
       tree s = loop->step;
@@ -1495,13 +1526,14 @@ expand_oacc_collapse_init (const struct omp_for_data *fd,
   return total;
 }
 
-/* Emit initializers for collapsed loop members.  IVAR is the outer
+/* Emit initializers for collapsed loop members.  INNER is true if
+   this is for the element loop of a TILE.  IVAR is the outer
    loop iteration variable, from which collapsed loop iteration values
    are  calculated.  COUNTS array has been initialized by
    expand_oacc_collapse_inits.  */
 
 static void
-expand_oacc_collapse_vars (const struct omp_for_data *fd,
+expand_oacc_collapse_vars (const struct omp_for_data *fd, bool inner,
 			   gimple_stmt_iterator *gsi,
 			   const oacc_collapse *counts, tree ivar)
 {
@@ -1513,7 +1545,8 @@ expand_oacc_collapse_vars (const struct omp_for_data *fd,
     {
       const omp_for_data_loop *loop = &fd->loops[ix];
       const oacc_collapse *collapse = &counts[ix];
-      tree iter_type = TREE_TYPE (loop->v);
+      tree v = inner ? loop->v : collapse->outer;
+      tree iter_type = TREE_TYPE (v);
       tree diff_type = TREE_TYPE (collapse->step);
       tree plus_type = iter_type;
       enum tree_code plus_code = PLUS_EXPR;
@@ -1525,24 +1558,25 @@ expand_oacc_collapse_vars (const struct omp_for_data *fd,
 	  plus_type = sizetype;
 	}
 
-      expr = fold_build2 (TRUNC_MOD_EXPR, ivar_type, ivar,
-			  fold_convert (ivar_type, collapse->iters));
+      expr = ivar;
+      if (ix)
+	{
+	  tree mod = fold_convert (ivar_type, collapse->iters);
+	  ivar = fold_build2 (TRUNC_DIV_EXPR, ivar_type, expr, mod);
+	  expr = fold_build2 (TRUNC_MOD_EXPR, ivar_type, expr, mod);
+	  ivar = force_gimple_operand_gsi (gsi, ivar, true, NULL_TREE,
+					   true, GSI_SAME_STMT);
+	}
+
       expr = fold_build2 (MULT_EXPR, diff_type, fold_convert (diff_type, expr),
 			  collapse->step);
-      expr = fold_build2 (plus_code, iter_type, collapse->base,
+      expr = fold_build2 (plus_code, iter_type,
+			  inner ? collapse->outer : collapse->base,
 			  fold_convert (plus_type, expr));
       expr = force_gimple_operand_gsi (gsi, expr, false, NULL_TREE,
 				       true, GSI_SAME_STMT);
-      gassign *ass = gimple_build_assign (loop->v, expr);
+      gassign *ass = gimple_build_assign (v, expr);
       gsi_insert_before (gsi, ass, GSI_SAME_STMT);
-
-      if (ix)
-	{
-	  expr = fold_build2 (TRUNC_DIV_EXPR, ivar_type, ivar,
-			      fold_convert (ivar_type, collapse->iters));
-	  ivar = force_gimple_operand_gsi (gsi, expr, true, NULL_TREE,
-					   true, GSI_SAME_STMT);
-	}
     }
 }
 
@@ -5230,7 +5264,8 @@ expand_omp_taskloop_for_inner (struct omp_region *region,
    where LTGT is < or >.  We may have a specified chunking size, CHUNKING
    (constant 0 for no chunking) and we will have a GWV partitioning
    mask, specifying dimensions over which the loop is to be
-   partitioned (see note below).  We generate code that looks like:
+   partitioned (see note below).  We generate code that looks like
+   (this ignores tiling):
 
    <entry_bb> [incoming FALL->body, BRANCH->exit]
      typedef signedintify (typeof (V)) T;  // underlying signed integral type
@@ -5260,11 +5295,7 @@ expand_omp_taskloop_for_inner (struct omp_region *region,
    <exit_bb> [incoming]
      V = B + ((range -/+ 1) / S +/- 1) * S [*]
 
-   [*] Needed if V live at end of loop
-
-   Note: CHUNKING & GWV mask are specified explicitly here.  This is a
-   transition, and will be specified by a more general mechanism shortly.
- */
+   [*] Needed if V live at end of loop.  */
 
 static void
 expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
@@ -5327,9 +5358,16 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
   tree step = create_tmp_var (diff_type, ".step");
   bool up = cond_code == LT_EXPR;
   tree dir = build_int_cst (diff_type, up ? +1 : -1);
-  bool chunking = !gimple_in_ssa_p (cfun);;
+  bool chunking = !gimple_in_ssa_p (cfun);
   bool negating;
 
+  /* Tiling vars.  */
+  tree tile_size = NULL_TREE;
+  tree element_s = NULL_TREE;
+  tree e_bound = NULL_TREE, e_offset = NULL_TREE, e_step = NULL_TREE;
+  basic_block elem_body_bb = NULL;
+  basic_block elem_cont_bb = NULL;
+
   /* SSA instances.  */
   tree offset_incr = NULL_TREE;
   tree offset_init = NULL_TREE;
@@ -5360,11 +5398,12 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
       gwv = build_int_cst (integer_type_node, GOMP_DIM_MASK (GOMP_DIM_GANG));
     }
 
-  if (fd->collapse > 1)
+  if (fd->collapse > 1 || fd->tiling)
     {
+      gcc_assert (!gimple_in_ssa_p (cfun) && up);
       counts = XALLOCAVEC (struct oacc_collapse, fd->collapse);
       tree total = expand_oacc_collapse_init (fd, &gsi, counts,
-					      TREE_TYPE (fd->loop.n2));
+					      TREE_TYPE (fd->loop.n2), loc);
 
       if (SSA_VAR_P (fd->loop.n2))
 	{
@@ -5373,7 +5412,6 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
 	  ass = gimple_build_assign (fd->loop.n2, total);
 	  gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
 	}
-
     }
 
   tree b = fd->loop.n1;
@@ -5397,6 +5435,29 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
   expr = fold_convert (diff_type, chunk_size);
   chunk_size = force_gimple_operand_gsi (&gsi, expr, true,
 					 NULL_TREE, true, GSI_SAME_STMT);
+
+  if (fd->tiling)
+    {
+      /* Determine the tile size and element step,
+	 modify the outer loop step size.  */
+      tile_size = create_tmp_var (diff_type, ".tile_size");
+      expr = build_int_cst (diff_type, 1);
+      for (int ix = 0; ix < fd->collapse; ix++)
+	expr = fold_build2 (MULT_EXPR, diff_type, counts[ix].tile, expr);
+      expr = force_gimple_operand_gsi (&gsi, expr, true,
+				       NULL_TREE, true, GSI_SAME_STMT);
+      ass = gimple_build_assign (tile_size, expr);
+      gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+      element_s = create_tmp_var (diff_type, ".element_s");
+      ass = gimple_build_assign (element_s, s);
+      gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+      expr = fold_build2 (MULT_EXPR, diff_type, s, tile_size);
+      s = force_gimple_operand_gsi (&gsi, expr, true,
+				    NULL_TREE, true, GSI_SAME_STMT);
+    }
+
   /* Determine the range, avoiding possible unsigned->signed overflow.  */
   negating = !up && TYPE_UNSIGNED (iter_type);
   expr = fold_build2 (MINUS_EXPR, plus_type,
@@ -5501,8 +5562,72 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
 				       true, GSI_SAME_STMT);
       ass = gimple_build_assign (v, expr);
       gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
-      if (fd->collapse > 1)
-	expand_oacc_collapse_vars (fd, &gsi, counts, v);
+
+      if (fd->collapse > 1 || fd->tiling)
+	expand_oacc_collapse_vars (fd, false, &gsi, counts, v);
+
+      if (fd->tiling)
+	{
+	  /* Determine the range of the element loop -- usually simply
+	     the tile_size, but could be smaller if the final
+	     iteration of the outer loop is a partial tile.  */
+	  tree e_range = create_tmp_var (diff_type, ".e_range");
+
+	  expr = build2 (MIN_EXPR, diff_type,
+			 build2 (MINUS_EXPR, diff_type, bound, offset),
+			 build2 (MULT_EXPR, diff_type, tile_size,
+				 element_s));
+	  expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+					   true, GSI_SAME_STMT);
+	  ass = gimple_build_assign (e_range, expr);
+	  gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+	  /* Determine bound, offset & step of inner loop. */
+	  e_bound = create_tmp_var (diff_type, ".e_bound");
+	  e_offset = create_tmp_var (diff_type, ".e_offset");
+	  e_step = create_tmp_var (diff_type, ".e_step");
+
+	  /* Mark these as element loops.  */
+	  tree t, e_gwv = integer_minus_one_node;
+	  tree chunk = build_int_cst (diff_type, 0); /* Never chunked.  */
+
+	  t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_OFFSET);
+	  call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
+					     element_s, chunk, e_gwv, chunk);
+	  gimple_call_set_lhs (call, e_offset);
+	  gimple_set_location (call, loc);
+	  gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+	  t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_BOUND);
+	  call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
+					     element_s, chunk, e_gwv, e_offset);
+	  gimple_call_set_lhs (call, e_bound);
+	  gimple_set_location (call, loc);
+	  gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+	  t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_STEP);
+	  call = gimple_build_call_internal (IFN_GOACC_LOOP, 6, t, dir, e_range,
+					     element_s, chunk, e_gwv);
+	  gimple_call_set_lhs (call, e_step);
+	  gimple_set_location (call, loc);
+	  gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+	  /* Add test and split block.  */
+	  expr = build2 (cond_code, boolean_type_node, e_offset, e_bound);
+	  stmt = gimple_build_cond_empty (expr);
+	  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+	  split = split_block (body_bb, stmt);
+	  elem_body_bb = split->dest;
+	  if (cont_bb == body_bb)
+	    cont_bb = elem_body_bb;
+	  body_bb = split->src;
+
+	  split->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
+
+	  /* Initialize the user's loop vars.  */
+	  gsi = gsi_start_bb (elem_body_bb);
+	  expand_oacc_collapse_vars (fd, true, &gsi, counts, e_offset);
+	}
     }
 
   /* Loop increment goes into cont_bb.  If this is not a loop, we
@@ -5516,10 +5641,34 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
       gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
       loc = gimple_location (cont_stmt);
 
+      if (fd->tiling)
+	{
+	  /* Insert element loop increment and test.  */
+	  expr = build2 (PLUS_EXPR, diff_type, e_offset, e_step);
+	  expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+					   true, GSI_SAME_STMT);
+	  ass = gimple_build_assign (e_offset, expr);
+	  gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+	  expr = build2 (cond_code, boolean_type_node, e_offset, e_bound);
+
+	  stmt = gimple_build_cond_empty (expr);
+	  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+	  split = split_block (cont_bb, stmt);
+	  elem_cont_bb = split->src;
+	  cont_bb = split->dest;
+
+	  split->flags ^= EDGE_FALLTHRU | EDGE_FALSE_VALUE;
+	  make_edge (elem_cont_bb, elem_body_bb, EDGE_TRUE_VALUE);
+
+	  make_edge (body_bb, cont_bb, EDGE_FALSE_VALUE);
+
+	  gsi = gsi_for_stmt (cont_stmt);
+	}
+
       /* Increment offset.  */
       if (gimple_in_ssa_p (cfun))
-	expr= build2 (plus_code, iter_type, offset,
-		      fold_convert (plus_type, step));
+	expr = build2 (plus_code, iter_type, offset,
+		       fold_convert (plus_type, step));
       else
 	expr = build2 (PLUS_EXPR, diff_type, offset, step);
       expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
@@ -5592,7 +5741,7 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
 
   if (cont_bb)
     {
-      /* We now have one or two nested loops.  Update the loop
+      /* We now have one, two or three nested loops.  Update the loop
 	 structures.  */
       struct loop *parent = entry_bb->loop_father;
       struct loop *body = body_bb->loop_father;
@@ -5619,6 +5768,15 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
 	  body_loop->header = body_bb;
 	  body_loop->latch = cont_bb;
 	  add_loop (body_loop, parent);
+
+	  if (fd->tiling)
+	    {
+	      /* Insert tiling's element loop.  */
+	      struct loop *inner_loop = alloc_loop ();
+	      inner_loop->header = elem_body_bb;
+	      inner_loop->latch = elem_cont_bb;
+	      add_loop (inner_loop, body_loop);
+	    }
 	}
     }
 }
diff --git a/gcc/omp-general.c b/gcc/omp-general.c
index 649dbf30621b8675fd6f3b8c00f4f121600d0361..3f9aec8d6a7dc9329ab823098e5ca954a9a3278f 100644
--- a/gcc/omp-general.c
+++ b/gcc/omp-general.c
@@ -133,13 +133,9 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
 
   fd->for_stmt = for_stmt;
   fd->pre = NULL;
-  if (gimple_omp_for_collapse (for_stmt) > 1)
-    fd->loops = loops;
-  else
-    fd->loops = &fd->loop;
-
   fd->have_nowait = distribute || simd;
   fd->have_ordered = false;
+  fd->tiling = NULL_TREE;
   fd->collapse = 1;
   fd->ordered = 0;
   fd->sched_kind = OMP_CLAUSE_SCHEDULE_STATIC;
@@ -184,9 +180,22 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
 	    collapse_count = &OMP_CLAUSE_COLLAPSE_COUNT (t);
 	  }
 	break;
+      case OMP_CLAUSE_TILE:
+	fd->tiling = OMP_CLAUSE_TILE_LIST (t);
+	fd->collapse = list_length (fd->tiling);
+	gcc_assert (fd->collapse);
+	collapse_iter = &OMP_CLAUSE_TILE_ITERVAR (t);
+	collapse_count = &OMP_CLAUSE_TILE_COUNT (t);
+	break;
       default:
 	break;
       }
+
+  if (fd->collapse > 1 || fd->tiling)
+    fd->loops = loops;
+  else
+    fd->loops = &fd->loop;
+
   if (fd->ordered && fd->collapse == 1 && loops != NULL)
     {
       fd->loops = loops;
@@ -205,7 +214,7 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
       fd->sched_kind = OMP_CLAUSE_SCHEDULE_STATIC;
       gcc_assert (fd->chunk_size == NULL);
     }
-  gcc_assert (fd->collapse == 1 || collapse_iter != NULL);
+  gcc_assert ((fd->collapse == 1 && !fd->tiling) || collapse_iter != NULL);
   if (taskloop)
     fd->sched_kind = OMP_CLAUSE_SCHEDULE_RUNTIME;
   if (fd->sched_kind == OMP_CLAUSE_SCHEDULE_RUNTIME)
@@ -223,7 +232,10 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
   int cnt = fd->ordered ? fd->ordered : fd->collapse;
   for (i = 0; i < cnt; i++)
     {
-      if (i == 0 && fd->collapse == 1 && (fd->ordered == 0 || loops == NULL))
+      if (i == 0
+	  && fd->collapse == 1
+	  && !fd->tiling
+	  && (fd->ordered == 0 || loops == NULL))
 	loop = &fd->loop;
       else if (loops != NULL)
 	loop = loops + i;
@@ -252,7 +264,7 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
 	  || (fd->sched_kind == OMP_CLAUSE_SCHEDULE_STATIC
 	      && !fd->have_ordered))
 	{
-	  if (fd->collapse == 1)
+	  if (fd->collapse == 1 && !fd->tiling)
 	    iter_type = TREE_TYPE (loop->v);
 	  else if (i == 0
 		   || TYPE_PRECISION (iter_type)
@@ -383,7 +395,7 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd,
 	*collapse_count = create_tmp_var (iter_type, ".count");
     }
 
-  if (fd->collapse > 1 || (fd->ordered && loops))
+  if (fd->collapse > 1 || fd->tiling || (fd->ordered && loops))
     {
       fd->loop.v = *collapse_iter;
       fd->loop.n1 = build_int_cst (TREE_TYPE (fd->loop.v), 0);
diff --git a/gcc/omp-general.h b/gcc/omp-general.h
index d89e8c179bf1e901dd713cc175aaebd3a941bdf5..3cf7fcec41fb76137c62ed465941e81810f140f1 100644
--- a/gcc/omp-general.h
+++ b/gcc/omp-general.h
@@ -31,9 +31,10 @@ enum oacc_loop_flags {
   OLF_AUTO	= 1u << 1,	/* Compiler chooses axes.  */
   OLF_INDEPENDENT = 1u << 2,	/* Iterations are known independent.  */
   OLF_GANG_STATIC = 1u << 3,	/* Gang partitioning is static (has op). */
-
+  OLF_TILE	= 1u << 4,	/* Tiled loop. */
+  
   /* Explicitly specified loop axes.  */
-  OLF_DIM_BASE = 4,
+  OLF_DIM_BASE = 5,
   OLF_DIM_GANG   = 1u << (OLF_DIM_BASE + GOMP_DIM_GANG),
   OLF_DIM_WORKER = 1u << (OLF_DIM_BASE + GOMP_DIM_WORKER),
   OLF_DIM_VECTOR = 1u << (OLF_DIM_BASE + GOMP_DIM_VECTOR),
@@ -58,7 +59,8 @@ struct omp_for_data
   tree chunk_size;
   gomp_for *for_stmt;
   tree pre, iter_type;
-  int collapse;
+  tree tiling;  /* Tiling values (if non null).  */
+  int collapse;  /* Collapsed loops, 1 for a non-collapsed loop.  */
   int ordered;
   bool have_nowait, have_ordered, simd_schedule;
   unsigned char sched_modifiers;
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index ff0f4477cd774f0e9c2eb6df14ffb16e942d770c..35df02c70a4ad8dcdc81b25d459fc9b274451a33 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -1330,6 +1330,7 @@ scan_sharing_clauses (tree clauses, omp_context *ctx,
 	case OMP_CLAUSE_INDEPENDENT:
 	case OMP_CLAUSE_AUTO:
 	case OMP_CLAUSE_SEQ:
+	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE__SIMT_:
 	  break;
 
@@ -1340,7 +1341,6 @@ scan_sharing_clauses (tree clauses, omp_context *ctx,
 	    install_var_local (decl, ctx);
 	  break;
 
-	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE__CACHE_:
 	default:
 	  gcc_unreachable ();
@@ -1501,11 +1501,11 @@ scan_sharing_clauses (tree clauses, omp_context *ctx,
 	case OMP_CLAUSE_INDEPENDENT:
 	case OMP_CLAUSE_AUTO:
 	case OMP_CLAUSE_SEQ:
+	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE__GRIDDIM_:
 	case OMP_CLAUSE__SIMT_:
 	  break;
 
-	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE__CACHE_:
 	default:
 	  gcc_unreachable ();
@@ -5610,6 +5610,10 @@ lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses,
 	  tag |= OLF_INDEPENDENT;
 	  break;
 
+	case OMP_CLAUSE_TILE:
+	  tag |= OLF_TILE;
+	  break;
+
 	default:
 	  continue;
 	}
@@ -5627,14 +5631,20 @@ lower_oacc_head_mark (location_t loc, tree ddvar, tree clauses,
   if (!tgt || is_oacc_parallel (tgt))
     tag |= OLF_INDEPENDENT;
 
-  /* A loop lacking SEQ, GANG, WORKER and/or VECTOR is implicitly AUTO.  */
-  if (!(tag & (((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE)
-	       | OLF_SEQ)))
-      tag |= OLF_AUTO;
+  if (tag & OLF_TILE)
+    /* Tiling could use all 3 levels.  */ 
+    levels = 3;
+  else
+    {
+      /* A loop lacking SEQ, GANG, WORKER and/or VECTOR could be AUTO.
+	 Ensure at least one level, or 2 for possible auto
+	 partitioning */
+      bool maybe_auto = !(tag & (((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
+				  << OLF_DIM_BASE) | OLF_SEQ));
 
-  /* Ensure at least one level.  */
-  if (!levels)
-    levels++;
+      if (levels < 1u + maybe_auto)
+	levels = 1u + maybe_auto;
+    }
 
   args.quick_push (build_int_cst (integer_type_node, levels));
   args.quick_push (build_int_cst (integer_type_node, tag));
diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index 6ff6bc2eeb9557c463894ed4b862ff03f79ec8c4..e4ce48cb8e5e1b7dcc7774a4983b0f2a03ec5666 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -67,9 +67,10 @@ struct oacc_loop
   tree routine;  /* Pseudo-loop enclosing a routine.  */
 
   unsigned mask;   /* Partitioning mask.  */
+  unsigned e_mask; /* Partitioning of element loops (when tiling).  */
   unsigned inner;  /* Partitioning of inner loops.  */
   unsigned flags;  /* Partitioning flags.  */
-  unsigned ifns;   /* Contained loop abstraction functions.  */
+  vec<gcall *> ifns;  /* Contained loop abstraction functions.  */
   tree chunk_size; /* Chunk size.  */
   gcall *head_end; /* Final marker of head sequence.  */
 };
@@ -217,6 +218,23 @@ omp_finish_file (void)
     }
 }
 
+/* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
+   axis DIM.  Return a tmp var holding the result.  */
+
+static tree
+oacc_dim_call (bool pos, int dim, gimple_seq *seq)
+{
+  tree arg = build_int_cst (unsigned_type_node, dim);
+  tree size = create_tmp_var (integer_type_node);
+  enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
+  gimple *call = gimple_build_call_internal (fn, 1, arg);
+
+  gimple_call_set_lhs (call, size);
+  gimple_seq_add_stmt (seq, call);
+
+  return size;
+}
+
 /* Find the number of threads (POS = false), or thread number (POS =
    true) for an OpenACC region partitioned as MASK.  Setup code
    required for the calculation is added to SEQ.  */
@@ -231,29 +249,17 @@ oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
     if (GOMP_DIM_MASK (ix) & mask)
       {
-	tree arg = build_int_cst (unsigned_type_node, ix);
-
 	if (res)
 	  {
 	    /* We had an outer index, so scale that by the size of
 	       this dimension.  */
-	    tree n = create_tmp_var (integer_type_node);
-	    gimple *call
-	      = gimple_build_call_internal (IFN_GOACC_DIM_SIZE, 1, arg);
-
-	    gimple_call_set_lhs (call, n);
-	    gimple_seq_add_stmt (seq, call);
+	    tree n = oacc_dim_call (false, ix, seq);
 	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
 	  }
 	if (pos)
 	  {
 	    /* Determine index in this dimension.  */
-	    tree id = create_tmp_var (integer_type_node);
-	    gimple *call = gimple_build_call_internal
-	      (IFN_GOACC_DIM_POS, 1, arg);
-
-	    gimple_call_set_lhs (call, id);
-	    gimple_seq_add_stmt (seq, call);
+	    tree id = oacc_dim_call (true, ix, seq);
 	    if (res)
 	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
 	    else
@@ -452,6 +458,85 @@ oacc_xform_loop (gcall *call)
   gsi_replace_with_seq (&gsi, seq, true);
 }
 
+/* Transform a GOACC_TILE call.  Determines the element loop span for
+   the specified loop of the nest.  This is 1 if we're not tiling.
+   
+   GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);  */
+
+static void
+oacc_xform_tile (gcall *call)
+{
+  gimple_stmt_iterator gsi = gsi_for_stmt (call);
+  unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
+  /* Inner loops have higher loop_nos.  */
+  unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
+  tree tile_size = gimple_call_arg (call, 2);
+  unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
+  tree lhs = gimple_call_lhs (call);
+  tree type = TREE_TYPE (lhs);
+  gimple_seq seq = NULL;
+  tree span = build_int_cst (type, 1);
+
+  gcc_assert (!(e_mask
+		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
+		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
+  push_gimplify_context (!seen_error ());
+
+#ifndef ACCEL_COMPILER
+  /* Partitioning disabled on host compilers.  */
+  e_mask = 0;
+#endif
+  if (!e_mask)
+    /* Not paritioning.  */
+    span = integer_one_node;
+  else if (!integer_zerop (tile_size))
+    /* User explicitly specified size.  */
+    span = tile_size;
+  else
+    {
+      /* Pick a size based on the paritioning of the element loop and
+	 the number of loop nests.  */
+      tree first_size = NULL_TREE;
+      tree second_size = NULL_TREE;
+
+      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+	first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
+      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+	second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
+
+      if (!first_size)
+	{
+	  first_size = second_size;
+	  second_size = NULL_TREE;
+	}
+
+      if (loop_no + 1 == collapse)
+	{
+	  span = first_size;
+	  if (!loop_no && second_size)
+	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
+				span, second_size);
+	}
+      else if (loop_no + 2 == collapse)
+	span = second_size;
+      else
+	span = NULL_TREE;
+
+      if (!span)
+	/* There's no obvious element size for this loop.  Options
+	   are 1, first_size or some non-unity constant (32 is my
+	   favourite).   We should gather some statistics.  */
+	span = first_size;
+    }
+
+  span = fold_convert (type, span);
+  gimplify_assign (lhs, span, &seq);
+
+  pop_gimplify_context (NULL);
+
+  gsi_replace_with_seq (&gsi, seq, true);
+}
+
 /* Default partitioned and minimum partitioned dimensions.  */
 
 static int oacc_default_dims[GOMP_DIM_MAX];
@@ -610,8 +695,7 @@ new_oacc_loop_raw (oacc_loop *parent, location_t loc)
   memset (loop->tails, 0, sizeof (loop->tails));
   loop->routine = NULL_TREE;
 
-  loop->mask = loop->flags = loop->inner = 0;
-  loop->ifns = 0;
+  loop->mask = loop->e_mask = loop->flags = loop->inner = 0;
   loop->chunk_size = 0;
   loop->head_end = NULL;
 
@@ -674,7 +758,7 @@ static oacc_loop *
 finish_oacc_loop (oacc_loop *loop)
 {
   /* If the loop has been collapsed, don't partition it.  */
-  if (!loop->ifns)
+  if (loop->ifns.is_empty ())
     loop->mask = loop->flags = 0;
   return loop->parent;
 }
@@ -810,9 +894,10 @@ oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
 	  break;
 
 	case IFN_GOACC_LOOP:
-	  /* Count the goacc loop abstraction fns, to determine if the
-	     loop was collapsed already.  */
-	  loop->ifns++;
+	case IFN_GOACC_TILE:
+	  /* Record the abstraction function, so we can manipulate it
+	     later.  */
+	  loop->ifns.safe_push (call);
 	  break;
 
 	case IFN_UNIQUE:
@@ -947,51 +1032,6 @@ oacc_loop_xform_head_tail (gcall *from, int level)
     }
 }
 
-/* Transform the IFN_GOACC_LOOP internal functions by providing the
-   determined partitioning mask and chunking argument.  END_MARKER
-   points at the end IFN_HEAD_TAIL call intgroducing the loop.  IFNS
-   is the number of IFN_GOACC_LOOP calls for the loop.  MASK_ARG is
-   the replacement partitioning mask and CHUNK_ARG is the replacement
-   chunking arg.  */
-
-static void
-oacc_loop_xform_loop (gcall *end_marker, unsigned ifns,
-		      tree mask_arg, tree chunk_arg)
-{
-  gimple_stmt_iterator gsi = gsi_for_stmt (end_marker);
-
-  gcc_checking_assert (ifns);
-  for (;;)
-    {
-      for (; !gsi_end_p (gsi); gsi_next (&gsi))
-	{
-	  gimple *stmt = gsi_stmt (gsi);
-
-	  if (!is_gimple_call (stmt))
-	    continue;
-
-	  gcall *call = as_a <gcall *> (stmt);
-
-	  if (!gimple_call_internal_p (call))
-	    continue;
-
-	  if (gimple_call_internal_fn (call) != IFN_GOACC_LOOP)
-	    continue;
-
-	  *gimple_call_arg_ptr (call, 5) = mask_arg;
-	  *gimple_call_arg_ptr (call, 4) = chunk_arg;
-	  ifns--;
-	  if (!ifns)
-	    return;
-	}
-
-      /* The LOOP_BOUND ifn could be in the single successor
-	 block.  */
-      basic_block bb = single_succ (gsi_bb (gsi));
-      gsi = gsi_start_bb (bb);
-    }
-}
-
 /* Process the discovered OpenACC loops, setting the correct
    partitioning level etc.  */
 
@@ -1004,13 +1044,34 @@ oacc_loop_process (oacc_loop *loop)
   if (loop->mask && !loop->routine)
     {
       int ix;
-      unsigned mask = loop->mask;
-      unsigned dim = GOMP_DIM_GANG;
-      tree mask_arg = build_int_cst (unsigned_type_node, mask);
+      tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
+      tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
       tree chunk_arg = loop->chunk_size;
+      gcall *call;
+      
+      for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
+	switch (gimple_call_internal_fn (call))
+	  {
+	  case IFN_GOACC_LOOP:
+	    {
+	      bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
+	      gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
+	      if (!is_e)
+		gimple_call_set_arg (call, 4, chunk_arg);
+	    }
+	    break;
 
-      oacc_loop_xform_loop (loop->head_end, loop->ifns, mask_arg, chunk_arg);
+	  case IFN_GOACC_TILE:
+	    gimple_call_set_arg (call, 3, mask_arg);
+	    gimple_call_set_arg (call, 4, e_mask_arg);
+	    break;
 
+	  default:
+	    gcc_unreachable ();
+	  }
+
+      unsigned dim = GOMP_DIM_GANG;
+      unsigned mask = loop->mask | loop->e_mask;
       for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
 	{
 	  while (!(GOMP_DIM_MASK (dim) & mask))
@@ -1050,10 +1111,16 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
     {
       bool auto_par = (loop->flags & OLF_AUTO) != 0;
       bool seq_par = (loop->flags & OLF_SEQ) != 0;
-
+      bool tiling = (loop->flags & OLF_TILE) != 0;
+      
       this_mask = ((loop->flags >> OLF_DIM_BASE)
 		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
 
+      /* Apply auto partitioning if this is a non-partitioned regular
+	 loop, or (no more than) single axis tiled loop.  */
+      bool maybe_auto
+	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
+
       if ((this_mask != 0) + auto_par + seq_par > 1)
 	{
 	  if (noisy)
@@ -1062,7 +1129,7 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
 		      ? "%<seq%> overrides other OpenACC loop specifiers"
 		      : "%<auto%> conflicts with other OpenACC loop "
 		      "specifiers");
-	  auto_par = false;
+	  maybe_auto = false;
 	  loop->flags &= ~OLF_AUTO;
 	  if (seq_par)
 	    {
@@ -1071,15 +1138,19 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
 	      this_mask = 0;
 	    }
 	}
-      if (auto_par && (loop->flags & OLF_INDEPENDENT))
-	mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
+
+      if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
+	{
+	  loop->flags |= OLF_AUTO;
+	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
+	}
     }
 
   if (this_mask & outer_mask)
     {
       const oacc_loop *outer;
       for (outer = loop->parent; outer; outer = outer->parent)
-	if (outer->mask & this_mask)
+	if ((outer->mask | outer->e_mask) & this_mask)
 	  break;
 
       if (noisy)
@@ -1125,13 +1196,33 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
 	}
     }
 
-  loop->mask = this_mask;
   mask_all |= this_mask;
 
+  if (loop->flags & OLF_TILE)
+    {
+      /* When tiling, vector goes to the element loop, and failing
+	 that we put worker there.  The std doesn't contemplate
+	 specifying all three.  We choose to put worker and vector on
+	 the element loops in that case.  */
+      unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
+      if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
+	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
+
+      loop->e_mask = this_e_mask;
+      this_mask ^= this_e_mask;
+    }
+
+  loop->mask = this_mask;
+
+  if (dump_file)
+    fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
+	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
+	     loop->mask, loop->e_mask);
+
   if (loop->child)
     {
-      loop->inner = oacc_loop_fixed_partitions (loop->child,
-						outer_mask | this_mask);
+      unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
+      loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
       mask_all |= loop->inner;
     }
 
@@ -1143,14 +1234,17 @@ oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
 
 /* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
    OUTER_MASK is the partitioning this loop is contained within.
+   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
    Return the cumulative partitioning used by this loop, siblings and
    children.  */
 
 static unsigned
-oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
+oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
+			   bool outer_assign)
 {
   bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
   bool noisy = true;
+  bool tiling = loop->flags & OLF_TILE;
 
 #ifdef ACCEL_COMPILER
   /* When device_type is supported, we want the device compiler to be
@@ -1158,29 +1252,50 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
   noisy = false;
 #endif
 
-  if (assign && outer_mask < GOMP_DIM_MASK (GOMP_DIM_MAX - 1))
+  if (assign && (!outer_assign | loop->inner))
     {
-      /* Allocate the outermost loop at the outermost available
-	 level.  */
-      unsigned this_mask = outer_mask + 1;
+      /* Allocate outermost and non-innermost loops at the outermost
+	 non-innermost available level.  */
+      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
+
+      /* Find the first outermost available partition. */
+      while (this_mask <= outer_mask)
+	this_mask <<= 1;
+      
+      /* Grab two axes if tiling, and we've not assigned anything  */
+      if (tiling && !(loop->mask | loop->e_mask))
+	this_mask |= this_mask << 1;
+
+      /* Prohibit the innermost partitioning at the moment.  */
+      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
+
+      /* Don't use any dimension explicitly claimed by an inner loop. */
+      this_mask &= ~loop->inner;
+
+      if (tiling && !loop->e_mask)
+	{
+	  /* If we got two axes, allocate the inner one to the element
+	     loop.  */
+	  loop->e_mask = this_mask & (this_mask << 1);
+	  this_mask ^= loop->e_mask;
+	}
 
-      if (!(this_mask & loop->inner))
-	loop->mask = this_mask;
+      loop->mask |= this_mask;
     }
 
   if (loop->child)
     {
-      unsigned child_mask = outer_mask | loop->mask;
-
-      if (loop->mask || assign)
-	child_mask |= GOMP_DIM_MASK (GOMP_DIM_MAX);
-
-      loop->inner = oacc_loop_auto_partitions (loop->child, child_mask);
+      unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
+      loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
+					       outer_assign | assign);
     }
 
-  if (assign && !loop->mask)
+  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
     {
-      /* Allocate the loop at the innermost available level.  */
+      /* Allocate the loop at the innermost available level.  Note
+	 that we do this even if we already assigned this loop the
+	 outermost available level above.  That way we'll partition
+	 this along 2 axes, if they are available.  */
       unsigned this_mask = 0;
 
       /* Determine the outermost partitioning used within this loop.  */
@@ -1193,24 +1308,44 @@ oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
       /* And avoid picking one use by an outer loop.  */
       this_mask &= ~outer_mask;
 
-      if (!this_mask && noisy)
-	warning_at (loop->loc, 0,
-		    "insufficient partitioning available to parallelize loop");
+      /* If tiling and we failed completely above, grab the next one
+	 too.  Making sure it doesn't hit an outer loop.  */
+      if (tiling)
+	{
+	  this_mask &= ~(loop->e_mask | loop->mask);
+	  unsigned tile_mask = ((this_mask >> 1)
+				& ~(outer_mask | loop->e_mask | loop->mask));
+
+	  if (tile_mask || loop->mask)
+	    {
+	      loop->e_mask |= this_mask;
+	      this_mask = tile_mask;
+	    }
+	  if (!loop->e_mask && noisy)
+	    warning_at (loop->loc, 0,
+			"insufficient partitioning available"
+			" to parallelize element loop");
+	}
 
-      loop->mask = this_mask;
+      loop->mask |= this_mask;
+      if (!loop->mask && noisy)
+	warning_at (loop->loc, 0,
+		    "insufficient partitioning available"
+		    " to parallelize%s loop", tiling ? " tile" : "");
     }
 
   if (assign && dump_file)
-    fprintf (dump_file, "Auto loop %s:%d assigned %d\n",
+    fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
 	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
-	     loop->mask);
+	     loop->mask, loop->e_mask);
 
   unsigned inner_mask = 0;
 
   if (loop->sibling)
-    inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask);
+    inner_mask |= oacc_loop_auto_partitions (loop->sibling,
+					     outer_mask, outer_assign);
 
-  inner_mask |= loop->inner | loop->mask;
+  inner_mask |= loop->inner | loop->mask | loop->e_mask;
 
   return inner_mask;
 }
@@ -1226,7 +1361,7 @@ oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
   if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
     {
       mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
-      mask_all |= oacc_loop_auto_partitions (loop, outer_mask);
+      mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
     }
   return mask_all;
 }
@@ -1376,6 +1511,11 @@ execute_oacc_device_lower ()
 	  {
 	  default: break;
 
+	  case IFN_GOACC_TILE:
+	    oacc_xform_tile (call);
+	    rescan = true;
+	    break;
+	    
 	  case IFN_GOACC_LOOP:
 	    oacc_xform_loop (call);
 	    rescan = true;
@@ -1403,7 +1543,7 @@ execute_oacc_device_lower ()
 	      switch (kind)
 		{
 		default:
-		  gcc_unreachable ();
+		  break;
 
 		case IFN_UNIQUE_OACC_FORK:
 		case IFN_UNIQUE_OACC_JOIN:
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index aeba293c41d2e35cd245bdefeadaebf3198f76c4..95c0bfc5c7e08e1c337939f3efa8e9727cdf6e32 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,21 @@
+2017-02-09  Nathan Sidwell  <nathan@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+	    Joseph Myers  <joseph@codesourcery.com>
+	    Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* c-c++-common/goacc/combined-directives.c: Remove xfail.
+	* c-c++-common/goacc/loop-auto-1.c: Adjust and add additional case.
+	* c-c++-common/goacc/loop-auto-2.c: New.
+	* c-c++-common/goacc/tile.c: Include stdbool, fix expected errors.
+	* c-c++-common/goacc/tile-2.c: New.
+	* g++.dg/goacc/template.C: Test tile subst.  Adjust erroneous uses.
+	* g++.dg/goacc/tile-1.C: New, check tile subst.
+	* gcc.dg/goacc/loop-processing-1.c: Adjust dg-final pattern.
+	* gfortran.dg/goacc/combined-directives.f90: Remove xfail.
+	* gfortran.dg/goacc/tile-1.f90: New test.
+	* gfortran.dg/goacc/tile-2.f90: New test.
+	* gfortran.dg/goacc/tile-lowering.f95: New test.
+
 2017-02-09  Richard Biener  <rguenther@suse.de>
 
 	PR tree-optimization/69823
diff --git a/gcc/testsuite/c-c++-common/goacc/combined-directives.c b/gcc/testsuite/c-c++-common/goacc/combined-directives.c
index 3fa800d7bbe050c0090b4acd1ca0bf31c40ff0c6..c2a3c57b48b83432c1bf000d279a2d09a8f2f57a 100644
--- a/gcc/testsuite/c-c++-common/goacc/combined-directives.c
+++ b/gcc/testsuite/c-c++-common/goacc/combined-directives.c
@@ -111,7 +111,6 @@ test ()
 // { dg-final { scan-tree-dump-times "acc loop vector" 2 "gimple" } }
 // { dg-final { scan-tree-dump-times "acc loop seq" 2 "gimple" } }
 // { dg-final { scan-tree-dump-times "acc loop auto" 2 "gimple" } }
-// XFAILed: OpenACC tile clauses are discarded during gimplification.
-// { dg-final { scan-tree-dump-times "acc loop tile.2, 3" 2 "gimple" { xfail *-*-* } } }
+// { dg-final { scan-tree-dump-times "acc loop tile.2, 3" 2 "gimple" } }
 // { dg-final { scan-tree-dump-times "acc loop independent private.i" 2 "gimple" } }
 // { dg-final { scan-tree-dump-times "private.z" 2 "gimple" } }
diff --git a/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c
index 33d53409fe3a678961b3a38a4eb04bb8f206057e..124befc400222e1c9ceca1ca669abca418842c13 100644
--- a/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c
+++ b/gcc/testsuite/c-c++-common/goacc/loop-auto-1.c
@@ -74,6 +74,21 @@ void Foo ()
 	    for (int kx = 0; kx < 10; kx++) {}
 	  }
       }
+
+#pragma acc loop auto
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop auto
+	for (int jx = 0; jx < 10; jx++)
+	  {
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+	    for (int kx = 0; kx < 10; kx++)
+	      {
+#pragma acc loop auto
+		for (int lx = 0; lx < 10; lx++) {}
+	      }
+	  }
+      }
   }
 }
 
@@ -214,10 +229,10 @@ void Vector (void)
 #pragma acc loop auto
     for (int ix = 0; ix < 10; ix++) {}
 
-#pragma acc loop auto
+#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
     for (int ix = 0; ix < 10; ix++)
       {
-#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+#pragma acc loop auto
 	for (int jx = 0; jx < 10; jx++) {}
       }
 }
diff --git a/gcc/testsuite/c-c++-common/goacc/loop-auto-2.c b/gcc/testsuite/c-c++-common/goacc/loop-auto-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..af3f0bddf2cbb8e4f189bd23e127b8008d84ba09
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/loop-auto-2.c
@@ -0,0 +1,107 @@
+
+// Tile parititioning
+
+void Ok ()
+{
+#pragma acc parallel num_gangs (10) num_workers(32) vector_length(32)
+  {
+    
+#pragma acc loop tile(*) gang vector
+    for (int ix = 0; ix < 10; ix++)
+      {
+      }
+
+#pragma acc loop tile(*)
+    for (int ix = 0; ix < 10; ix++)
+      {
+      }
+
+#pragma acc loop tile(*) gang
+    for (int ix = 0; ix < 10; ix++)
+      {
+	#pragma acc loop vector
+	for (int jx = 0; jx < 10; jx++)
+	  ;
+      }
+
+#pragma acc loop tile(*)
+    for (int ix = 0; ix < 10; ix++)
+      {
+	#pragma acc loop vector
+	for (int jx = 0; jx < 10; jx++)
+	  ;
+      }
+
+#pragma acc loop gang
+    for (int jx = 0; jx < 10; jx++)
+      {
+#pragma acc loop tile(*) vector
+	for (int ix = 0; ix < 10; ix++)
+	  {
+	  }
+
+#pragma acc loop tile(*)
+	for (int ix = 0; ix < 10; ix++)
+	  {
+	  }
+      }
+
+#pragma acc loop tile(*) worker
+    for (int ix = 0; ix < 10; ix++)
+      {
+	#pragma acc loop vector
+	for (int jx = 0; jx < 10; jx++)
+	  ;
+      }
+  }
+}
+
+void Bad ()
+{
+#pragma acc parallel num_gangs (10) num_workers(32) vector_length(32)
+  {
+    
+#pragma acc loop tile(*) gang vector /* { dg-message "containing loop" } */
+    for (int ix = 0; ix < 10; ix++)
+      {
+#pragma acc loop vector /* { dg-error "uses same" } */
+	for (int jx = 0; jx < 10; jx++)
+	  ;
+      }
+
+#pragma acc loop tile(*) gang vector
+    for (int ix = 0; ix < 10; ix++)
+      {
+	#pragma acc loop auto /* { dg-warning "insufficient partitioning" } */
+	for (int jx = 0; jx < 10; jx++)
+	  ;
+      }
+
+#pragma acc loop tile(*) auto /* { dg-warning "insufficient partitioning" } */
+    for (int ix = 0; ix < 10; ix++)
+      {
+	#pragma acc loop worker
+	for (int jx = 0; jx < 10; jx++)
+	  ;
+      }
+
+#pragma acc loop worker /* { dg-message "containing loop" } */
+    for (int jx = 0; jx < 10; jx++)
+      {
+#pragma acc loop tile(*) gang vector /* { dg-error "incorrectly nested" } */
+	for (int ix = 0; ix < 10; ix++)
+	  {
+	  }
+
+#pragma acc loop tile(*) vector /* { dg-warning "insufficient partitioning" } */
+	for (int ix = 0; ix < 10; ix++)
+	  {
+	  }
+
+#pragma acc loop tile(*) /* { dg-warning "insufficient partitioning" } */
+	for (int ix = 0; ix < 10; ix++)
+	  {
+	  }
+      }
+  }
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/tile-2.c b/gcc/testsuite/c-c++-common/goacc/tile-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..c8b240d225b940302c46ff1a544d83dc7876cd9f
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/tile-2.c
@@ -0,0 +1,21 @@
+int main ()
+{
+#pragma acc parallel
+  {
+#pragma acc loop tile (*,*)
+    for (int ix = 0; ix < 30; ix++)
+      ; /* { dg-error "not enough" } */
+
+#pragma acc loop tile (*,*)
+    for (int ix = 0; ix < 30; ix++)
+      for (int jx = 0; jx < ix; jx++) /* { dg-error "condition expression" } */
+	;
+    
+#pragma acc loop tile (*)
+    for (int ix = 0; ix < 30; ix++)
+      for (int jx = 0; jx < ix; jx++) /* OK */
+	;
+    
+  }
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/tile.c b/gcc/testsuite/c-c++-common/goacc/tile.c
index 8e70e718e1f860260505e45a556139ea13bab821..f10535a69e5b42f4e0eb16603a0d7a6eea9c76a2 100644
--- a/gcc/testsuite/c-c++-common/goacc/tile.c
+++ b/gcc/testsuite/c-c++-common/goacc/tile.c
@@ -1,7 +1,9 @@
+#include <stdbool.h>
+
 int
 main ()
 {
-  int i, *a, b;
+  int i, j, k, *a, b;
 
 #pragma acc parallel loop tile (10)
   for (i = 0; i < 100; i++)
@@ -13,11 +15,14 @@ main ()
 
 #pragma acc parallel loop tile (10, *)
   for (i = 0; i < 100; i++)
-    ;
+    for (j = 0; j < 100; j++)
+      ;
 
-#pragma acc parallel loop tile (10, *, i)
+#pragma acc parallel loop tile (10, *, i) // { dg-error "" }
   for (i = 0; i < 100; i++)
-    ;
+    for (j = 0; j < 100; j++)
+      for (k = 0; k < 100; k++)
+	;
 
 #pragma acc parallel loop tile // { dg-error "expected '\\\('" }
   for (i = 0; i < 100; i++)
@@ -35,37 +40,44 @@ main ()
   for (i = 0; i < 100; i++)
     ;
 
-#pragma acc parallel loop tile (1.1) // { dg-error "'tile' value must be integral" }
+#pragma acc parallel loop tile (1.1) // { dg-error "'tile' argument needs" }
   for (i = 0; i < 100; i++)
     ;
 
-#pragma acc parallel loop tile (-3) // { dg-warning "'tile' value must be positive" }
+#pragma acc parallel loop tile (-3) // { dg-error "'tile' argument needs" }
   for (i = 0; i < 100; i++)
     ;
 
-#pragma acc parallel loop tile (10,-3) // { dg-warning "'tile' value must be positive" }
+#pragma acc parallel loop tile (10,-3) // { dg-error "'tile' argument needs" }
   for (i = 0; i < 100; i++)
-    ;
+    for (j = 0; j < 100; j++)
+      ;
 
-#pragma acc parallel loop tile (-100,10,5) // { dg-warning "'tile' value must be positive" }
+#pragma acc parallel loop tile (-100,10,5) // { dg-error "'tile' argument needs" }
   for (i = 0; i < 100; i++)
-    ;
+    for (j = 0; j < 100; j++)
+      for (k = 0; k < 100; k++)
+	;
 
-#pragma acc parallel loop tile (1,2.0,true) // { dg-error "" }
+#pragma acc parallel loop tile (1,true)
   for (i = 0; i < 100; i++)
-    ;
+    for (j = 0; j < 100; j++)
+      ;
 
-#pragma acc parallel loop tile (*a, 1)
+#pragma acc parallel loop tile (*a, 1) // { dg-error "" }
   for (i = 0; i < 100; i++)
-    ;
+    for (j = 0; j < 100; j++)
+      ;
 
-#pragma acc parallel loop tile (1, *a, b)
+#pragma acc parallel loop tile (1, b) // { dg-error "" }
   for (i = 0; i < 100; i++)
-    ;
+    for (j = 0; j < 100; j++)
+      ;
 
-#pragma acc parallel loop tile (b, 1, *a)
+#pragma acc parallel loop tile (b, 1) // { dg-error "" }
   for (i = 0; i < 100; i++)
-    ;
+    for (j = 0; j < 100; j++)
+      ;
 
   return 0;
 }
@@ -73,7 +85,7 @@ main ()
 
 void par (void)
 {
-  int i, j;
+  int i, j, k;
 
 #pragma acc parallel
   {
@@ -95,22 +107,22 @@ void par (void)
 	for (j = 1; j < 10; j++)
 	  { }
       }
-#pragma acc loop tile(-2) // { dg-warning "'tile' value must be positive" }
+#pragma acc loop tile(-2)  // { dg-error "'tile' argument needs" }
     for (i = 1; i < 10; i++)
       { }
-#pragma acc loop tile(i)
+#pragma acc loop tile(i)  // { dg-error "" }
     for (i = 1; i < 10; i++)
       { }
 #pragma acc loop tile(2, 2, 1)
     for (i = 1; i < 3; i++)
       {
 	for (j = 4; j < 6; j++)
-	  { }
+	  for (k = 0; k< 100; k++);
       } 
 #pragma acc loop tile(2, 2)
     for (i = 1; i < 5; i+=2)
       {
-	for (j = i + 1; j < 7; j+=i)
+	for (j = i + 1; j < 7; j+=i) // { dg-error "initializer expression" }
 	  { }
       }
 #pragma acc loop vector tile(*) 
@@ -156,24 +168,21 @@ void p3 (void)
       for (j = 1; j < 10; j++)
 	{ }
     }
-#pragma acc parallel loop tile(-2) // { dg-warning "'tile' value must be positive" }
+#pragma acc parallel loop tile(-2)   // { dg-error "'tile' argument needs" }
   for (i = 1; i < 10; i++)
     { }
-#pragma acc parallel loop tile(i)
+#pragma acc parallel loop tile(i)   // { dg-error "" }
   for (i = 1; i < 10; i++)
     { }
 #pragma acc parallel loop tile(2, 2, 1)
   for (i = 1; i < 3; i++)
-    {
-      for (j = 4; j < 6; j++)
-        { }
-    }    
+    for (j = 4; j < 6; j++)
+      for (int k = 1 ; k < 2; k++)
+	;
 #pragma acc parallel loop tile(2, 2)
   for (i = 1; i < 5; i+=2)
-    {
-      for (j = i + 1; j < 7; j++)
-        { }
-    }
+    for (j = i + 1; j < 7; j++) // { dg-error "initializer expression" }
+      { }
 #pragma acc parallel loop vector tile(*) 
   for (i = 0; i < 10; i++)
     { }
@@ -227,22 +236,23 @@ kern (void)
 #pragma acc loop tile(*, 1) 
     for (i = 0; i < 10; i++)
       {
-	for (j = 0; j < 10; i++)
+	for (j = 0; j < 10; i++) /* { dg-error "increment expression" } */
 	  { }
       }
-#pragma acc loop tile(-2) // { dg-warning "'tile' value must be positive" }
+#pragma acc loop tile(-2) // { dg-error "'tile' argument needs" }
     for (i = 0; i < 10; i++)
       { }
-#pragma acc loop tile(i)
+#pragma acc loop tile(i) // { dg-error "" }
     for (i = 0; i < 10; i++)
       { }
 #pragma acc loop tile(2, 2, 1)
     for (i = 2; i < 4; i++)
-      for (i = 4; i < 6; i++)
+      for (j = 4; j < 6; j++)
+	for (int k = 4; k < 6; k++)
 	{ }
 #pragma acc loop tile(2, 2)
     for (i = 1; i < 5; i+=2)
-      for (j = i+1; j < 7; i++)
+      for (j = i+1; j < 7; j++) /* { dg-error "initializer expression" } */
 	{ }
 #pragma acc loop vector tile(*) 
     for (i = 0; i < 10; i++)
@@ -288,22 +298,21 @@ void k3 (void)
       for (j = 1; j < 10; j++)
 	{ }
     }
-#pragma acc kernels loop tile(-2) // { dg-warning "'tile' value must be positive" }
+#pragma acc kernels loop tile(-2) // { dg-error "'tile' argument needs" }
   for (i = 1; i < 10; i++)
     { }
-#pragma acc kernels loop tile(i)
+#pragma acc kernels loop tile(i) // { dg-error "" }
   for (i = 1; i < 10; i++)
     { }
 #pragma acc kernels loop tile(2, 2, 1)
   for (i = 1; i < 3; i++)
-    {
-      for (j = 4; j < 6; j++)
-	{ }
-    }    
+    for (j = 4; j < 6; j++)
+      for (int k = 1; k < 7; k++)
+	;
 #pragma acc kernels loop tile(2, 2)
   for (i = 1; i < 5; i++)
     {
-      for (j = i + 1; j < 7; j += i)
+      for (j = i + 1; j < 7; j += i) /* { dg-error "initializer expression" } */
 	{ }
     }
 #pragma acc kernels loop vector tile(*) 
diff --git a/gcc/testsuite/g++.dg/goacc/template.C b/gcc/testsuite/g++.dg/goacc/template.C
index f139dc25b58b1051688b14fc9b74ad498392b254..74f40d8922e736ce42411849e3f754bff8f959d5 100644
--- a/gcc/testsuite/g++.dg/goacc/template.C
+++ b/gcc/testsuite/g++.dg/goacc/template.C
@@ -5,7 +5,7 @@ accDouble(int val)
   return val * 2;
 }
 
-template<typename T> T
+template<typename T, int I> T
 oacc_parallel_copy (T a)
 {
   T b = 0;
@@ -36,7 +36,7 @@ oacc_parallel_copy (T a)
       for (int j = 0; j < 5; j++)
 	b = a;
 
-#pragma acc loop auto tile (a, 3)
+#pragma acc loop auto tile (I, 3)
     for (int i = 0; i < a; i++)
       for (int j = 0; j < 5; j++)
 	b = a;
@@ -135,7 +135,7 @@ oacc_kernels_copy (T a)
 int
 main ()
 {
-  int b = oacc_parallel_copy<int> (5);
+  int b = oacc_parallel_copy<int, 4> (5);
   int c = oacc_kernels_copy<int> (5);
 
   return b + c;
diff --git a/gcc/testsuite/g++.dg/goacc/tile-1.C b/gcc/testsuite/g++.dg/goacc/tile-1.C
new file mode 100644
index 0000000000000000000000000000000000000000..27c53835d3631fb990b7385285e50ebb67b264d7
--- /dev/null
+++ b/gcc/testsuite/g++.dg/goacc/tile-1.C
@@ -0,0 +1,16 @@
+/*  of tile erroneously clobbered the template, resulting
+    in missing errors and other fun.  */
+
+template <int I>
+void Foo ()
+{
+#pragma acc parallel loop tile(I) // { dg-error "" }
+  for (int ix = 0; ix < 10; ix++)
+    ;
+}
+
+int main ()
+{
+  Foo<1> ();  // OK
+  Foo<-1> (); // error
+}
diff --git a/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c b/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c
index 619576a17eed7655345d0263b3ac2a30cb5d1a0f..07f56a25329cf1619c3cdc37975761c69bb550e7 100644
--- a/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c
+++ b/gcc/testsuite/gcc.dg/goacc/loop-processing-1.c
@@ -15,4 +15,4 @@ void vector_1 (int *ary, int size)
   }
 }
 
-/* { dg-final { scan-tree-dump {OpenACC loops.*Loop 0\(0\).*Loop 14\(1\).*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 20\);.*Head-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 20\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 0\);.*Tail-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 0\);.*Loop 6\(4\).*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 6\);.*Head-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 6\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 2\);.*Tail-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 2\);} "oaccdevlow" } } */
+/* { dg-final { scan-tree-dump {OpenACC loops.*Loop 0\(0\).*Loop 24\(1\).*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 36\);.*Head-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 1, 36\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 0\);.*Tail-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 0\);.*Loop 6\(6\).*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 2, 6\);.*Head-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, 0, 2, 6\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 1\);.*Head-1:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_HEAD_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_FORK, \.data_dep\.[0-9_]+, 2\);.*Tail-1:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 2\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 2\);.*Tail-0:.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_TAIL_MARK, \.data_dep\.[0-9_]+, 1\);.*\.data_dep\.[0-9_]+ = UNIQUE \(OACC_JOIN, \.data_dep\.[0-9_]+, 1\);} "oaccdevlow" } } */
diff --git a/gcc/testsuite/gfortran.dg/goacc/combined-directives.f90 b/gcc/testsuite/gfortran.dg/goacc/combined-directives.f90
index abb5e6b6c3d1646dfe6cb0ac70a9165c5b10a263..42a447ad06bff8238048deb124cc020ff13c8447 100644
--- a/gcc/testsuite/gfortran.dg/goacc/combined-directives.f90
+++ b/gcc/testsuite/gfortran.dg/goacc/combined-directives.f90
@@ -143,8 +143,7 @@ end subroutine test
 ! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. vector" 2 "gimple" } }
 ! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. seq" 2 "gimple" } }
 ! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. auto" 2 "gimple" } }
-! XFAILed: OpenACC tile clauses are discarded during gimplification.
-! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. tile.2, 3" 2 "gimple" { xfail *-*-* } } }
+! { dg-final { scan-tree-dump-times "acc loop private.i. private.j. tile.2, 3" 2 "gimple" } }
 ! { dg-final { scan-tree-dump-times "acc loop private.i. independent" 2 "gimple" } }
 ! { dg-final { scan-tree-dump-times "private.z" 2 "gimple" } }
 ! { dg-final { scan-tree-dump-times "omp target oacc_\[^ \]+ map.force_tofrom:y" 2 "gimple" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/tile-1.f90 b/gcc/testsuite/gfortran.dg/goacc/tile-1.f90
new file mode 100644
index 0000000000000000000000000000000000000000..3dbabda0342ea6e9719dfb0b5bc8ee0aa0f3ef82
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/tile-1.f90
@@ -0,0 +1,339 @@
+subroutine parloop
+  integer, parameter :: n = 100
+  integer i, j, k, a
+
+  !$acc parallel loop tile(10)
+  do i = 1, n
+  end do
+  
+  !$acc parallel loop tile(*)
+  do i = 1, n
+  end do
+
+  !$acc parallel loop tile(10, *)
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+ 
+  !$acc parallel loop tile(10, *, i) ! { dg-error "" }
+  do i = 1, n
+     do j = 1, n
+        do k = 1, n
+        end do
+     end do
+  end do 
+
+  !$acc parallel loop tile ! { dg-error "Unclassifiable" }
+  do i = 1, n
+  end do
+
+  !$acc parallel loop tile() ! { dg-error "Syntax error" }
+  do i = 1, n
+  end do
+
+  !$acc parallel loop tile(,1) ! { dg-error "Syntax error" }
+  do i = 1, n
+  end do
+
+  !$acc parallel loop tile(,,) ! { dg-error "Syntax error" }
+  do i = 1, n
+  end do
+
+  !$acc parallel loop tile(1.1) ! { dg-error "requires a scalar INTEGER" }
+  do i = 1, n
+  end do
+
+  !$acc parallel loop tile(-3) ! { dg-warning "must be positive" }
+  do i = 1, n
+  end do
+
+  !$acc parallel loop tile(10, -3) ! { dg-warning "must be positive" }
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+
+  !$acc parallel loop tile(-100, 10, 5) ! { dg-warning "must be positive" }
+  do i = 1, n
+     do j = 1, n
+        do k = 1, n
+        end do
+     end do
+  end do 
+
+  !$acc parallel loop tile(10, .true.) ! { dg-error "requires a scalar" }
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+
+  !$acc parallel loop tile(1, a) ! { dg-error "constant expression" }
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+
+  !$acc parallel loop tile(a, 1) ! { dg-error "constant expression" }
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+
+  !$acc parallel loop tile(2, 3) collapse (2) ! { dg-error "Incompatible use" }
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+end subroutine parloop
+
+subroutine par
+  integer, parameter :: n = 100
+  integer i, j, k
+
+  !$acc parallel
+  !$acc loop tile ! { dg-error "Unclassifiable" }
+  do i = 1, n
+  end do
+
+  !$acc loop tile() ! { dg-error "Syntax error" }
+  do i = 1, n
+  end do
+
+  !$acc loop tile(1)
+  do i = 1, n
+  end do
+
+  !$acc loop tile(*)
+  do i = 1, n
+  end do
+
+  !$acc loop tile(2)
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+
+  !$acc loop tile(-2) ! { dg-warning "must be positive" }
+  do i = 1, n
+  end do
+
+  !$acc loop tile(i) ! { dg-error "constant expression" }
+  do i = 1, n
+  end do
+
+  !$acc loop tile(2, 2, 1)
+  do i = 1, n
+     do j = 1, n
+        do k = 1, n
+        end do
+     end do
+  end do 
+
+  !$acc parallel loop tile(2, 2)
+  do i = 1, n
+     do j = i+1, n, j ! { dg-error "rectangular iteration space" }
+     end do
+  end do
+
+  !$acc loop vector tile(*)
+  do i = 1, n
+  end do
+  
+  !$acc loop worker tile(*)
+  do i = 1, n
+  end do
+
+  !$acc loop gang tile(*)
+  do i = 1, n
+  end do
+
+  !$acc loop vector gang tile(*)
+  do i = 1, n
+  end do
+
+  !$acc loop vector worker tile(*)
+  do i = 1, n
+  end do
+
+  !$acc loop gang worker tile(*)
+  do i = 1, n
+  end do
+
+  !$acc loop tile(2, 3) collapse (2) ! { dg-error "Incompatible use" }
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+  !$acc end parallel
+end subroutine par
+
+subroutine kern
+  integer, parameter :: n = 100
+  integer i, j, k
+
+  !$acc kernels
+  !$acc loop tile  ! { dg-error "Unclassifiable" }
+  do i = 1, n
+  end do
+
+  !$acc loop tile() ! { dg-error "Syntax error" }
+  do i = 1, n
+  end do
+
+  !$acc loop tile(1)
+  do i = 1, n
+  end do
+
+  !$acc loop tile(*)
+  do i = 1, n
+  end do
+
+  !$acc loop tile(2)
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+
+  !$acc loop tile(-2) ! { dg-warning "must be positive" }
+  do i = 1, n
+  end do
+
+  !$acc loop tile(i) ! { dg-error "constant expression" }
+  do i = 1, n
+  end do
+
+  !$acc loop tile(2, 2, 1)
+  do i = 1, n
+     do j = 1, n
+        do k = 1, n
+        end do
+     end do
+  end do 
+
+  !$acc parallel loop tile(2, 2)
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+
+  !$acc loop vector tile(*)
+  do i = 1, n
+  end do
+  
+  !$acc loop worker tile(*)
+  do i = 1, n
+  end do
+
+  !$acc loop gang tile(*)
+  do i = 1, n
+  end do
+
+  !$acc loop vector gang tile(*)
+  do i = 1, n
+  end do
+
+  !$acc loop vector worker tile(*)
+  do i = 1, n
+  end do
+
+  !$acc loop gang worker tile(*)
+  do i = 1, n
+  end do
+
+  !$acc loop tile(2, 3) collapse (2) ! { dg-error "Incompatible use" }
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+  !$acc end kernels
+end subroutine kern
+
+subroutine kernsloop
+  integer, parameter :: n = 100
+  integer i, j, k, a
+
+  !$acc kernels loop tile(10)
+  do i = 1, n
+  end do
+  
+  !$acc kernels loop tile(*)
+  do i = 1, n
+  end do
+
+  !$acc kernels loop tile(10, *)
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+ 
+  !$acc kernels loop tile(10, *, i) ! { dg-error "" }
+  do i = 1, n
+     do j = 1, n
+        do k = 1, n
+        end do
+     end do
+  end do 
+
+  !$acc kernels loop tile ! { dg-error "Unclassifiable" }
+  do i = 1, n
+  end do
+
+  !$acc kernels loop tile() ! { dg-error "Syntax error" }
+  do i = 1, n
+  end do
+
+  !$acc kernels loop tile(,1) ! { dg-error "Syntax error" }
+  do i = 1, n
+  end do
+
+  !$acc kernels loop tile(,,) ! { dg-error "Syntax error" }
+  do i = 1, n
+  end do
+
+  !$acc kernels loop tile(1.1) ! { dg-error "requires a scalar INTEGER" }
+  do i = 1, n
+  end do
+
+  !$acc kernels loop tile(-3) ! { dg-warning "must be positive" }
+  do i = 1, n
+  end do
+
+  !$acc kernels loop tile(10, -3) ! { dg-warning "must be positive" }
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+
+  !$acc kernels loop tile(-100, 10, 5) ! { dg-warning "must be positive" }
+  do i = 1, n
+     do j = 1, n
+        do k = 1, n
+        end do
+     end do
+  end do 
+
+  !$acc kernels loop tile(10, .true.) ! { dg-error "requires a scalar" }
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+
+  !$acc kernels loop tile(1, a) ! { dg-error "constant expression" }
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+
+  !$acc kernels loop tile(a, 1) ! { dg-error "constant expression" }
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+
+  !$acc kernels loop tile(2, 3) collapse (2) ! { dg-error "Incompatible use" }
+  do i = 1, n
+     do j = 1, n
+     end do
+  end do
+end subroutine kernsloop
diff --git a/gcc/testsuite/gfortran.dg/goacc/tile-2.f90 b/gcc/testsuite/gfortran.dg/goacc/tile-2.f90
new file mode 100644
index 0000000000000000000000000000000000000000..c56754380cc9ab17cc1bbb462a058a61a6aa258e
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/tile-2.f90
@@ -0,0 +1,21 @@
+subroutine par
+  integer ix, jx
+
+  !$acc parallel
+  !$acc loop tile (*,*) ! { dg-error "not enough DO loops for tiled" }
+  do ix = 1, 30
+  end do
+
+  !$acc loop tile (*,*)
+  do ix = 1, 30
+     do jx = 1, ix ! { dg-error "tiled loops don.t form rectangular" }
+     end do
+  end do
+
+  !$acc loop tile (*)
+  do ix = 1, 30
+     do jx = 1, ix
+     end do
+  end do
+  !$acc end parallel
+end subroutine par
diff --git a/gcc/testsuite/gfortran.dg/goacc/tile-lowering.f95 b/gcc/testsuite/gfortran.dg/goacc/tile-lowering.f95
new file mode 100644
index 0000000000000000000000000000000000000000..1cb8b9cc512f4a6f8c393116e00dc8d5adddf454
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/tile-lowering.f95
@@ -0,0 +1,292 @@
+! { dg-do compile }
+! { dg-additional-options "-fdump-tree-original" }
+
+subroutine par
+  integer i, j, k
+
+  !$acc parallel
+  !$acc loop tile (1)
+  do i = 1, 10
+  end do
+
+  !$acc loop tile (*)
+  do i = 1, 10
+  end do
+
+  !$acc loop tile (1,2)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc loop tile (*,2)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc loop tile (1,*)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc loop tile (*,*)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc loop tile (1,2,3)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+
+  !$acc loop tile (*,2,3)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+
+  !$acc loop tile (1,*,3)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+
+  !$acc loop tile (1,2,*)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+  !$acc end parallel
+end subroutine par
+
+subroutine kerns
+  integer i, j, k
+
+  !$acc kernels
+  !$acc loop tile (1)
+  do i = 1, 10
+  end do
+
+  !$acc loop tile (*)
+  do i = 1, 10
+  end do
+
+  !$acc loop tile (1,2)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc loop tile (*,2)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc loop tile (1,*)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc loop tile (*,*)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc loop tile (1,2,3)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+
+  !$acc loop tile (*,2,3)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+
+  !$acc loop tile (1,*,3)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+
+  !$acc loop tile (1,2,*)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+  !$acc end kernels
+end subroutine kerns
+
+subroutine parloop
+  integer i, j, k
+
+  !$acc parallel loop tile (1)
+  do i = 1, 10
+  end do
+
+  !$acc parallel loop tile (*)
+  do i = 1, 10
+  end do
+
+  !$acc parallel loop tile (1,2)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc parallel loop tile (*,2)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc parallel loop tile (1,*)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc parallel loop tile (*,*)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc parallel loop tile (1,2,3)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+
+  !$acc parallel loop tile (*,2,3)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+
+  !$acc parallel loop tile (1,*,3)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+
+  !$acc parallel loop tile (1,2,*)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+end subroutine parloop
+
+subroutine kernloop
+  integer i, j, k
+
+  !$acc kernels loop tile (1)
+  do i = 1, 10
+  end do
+
+  !$acc kernels loop tile (*)
+  do i = 1, 10
+  end do
+
+  !$acc kernels loop tile (1,2)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc kernels loop tile (*,2)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc kernels loop tile (1,*)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc kernels loop tile (*,*)
+  do i = 1, 10
+     do j = 1, 10
+     end do
+  end do
+
+  !$acc kernels loop tile (1,2,3)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+
+  !$acc kernels loop tile (*,2,3)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+
+  !$acc kernels loop tile (1,*,3)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+
+  !$acc kernels loop tile (1,2,*)
+  do i = 1, 10
+     do j = 1, 10
+        do k = 1, 10
+        end do
+     end do
+  end do
+end subroutine kernloop
+
+
+! { dg-final { scan-tree-dump-times "tile\\(1\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(0\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(1, 2\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(0, 2\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(1, 0\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(0, 0\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(1, 2, 3\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(0, 2, 3\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(1, 0, 3\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "tile\\(1, 2, 0\\)" 4 "original" } }
+! { dg-final { scan-tree-dump-times "for \\(" 88 "original" } }
+! { dg-final { scan-tree-dump-times "while \\(" 0 "original" } }
diff --git a/gcc/tree-nested.c b/gcc/tree-nested.c
index c005e7c735dc93f137102583ceb9f9246823bc46..4a25025ef8f13bc5e52035062fa7e6dbdb7a9695 100644
--- a/gcc/tree-nested.c
+++ b/gcc/tree-nested.c
@@ -1274,6 +1274,7 @@ convert_nonlocal_omp_clauses (tree *pclauses, struct walk_stmt_info *wi)
 	case OMP_CLAUSE_DEFAULT:
 	case OMP_CLAUSE_COPYIN:
 	case OMP_CLAUSE_COLLAPSE:
+	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE_UNTIED:
 	case OMP_CLAUSE_MERGEABLE:
 	case OMP_CLAUSE_PROC_BIND:
@@ -1286,8 +1287,6 @@ convert_nonlocal_omp_clauses (tree *pclauses, struct walk_stmt_info *wi)
 	case OMP_CLAUSE_AUTO:
 	  break;
 
-	  /* OpenACC tile clauses are discarded during gimplification.  */
-	case OMP_CLAUSE_TILE:
 	  /* The following clause belongs to the OpenACC cache directive, which
 	     is discarded during gimplification.  */
 	case OMP_CLAUSE__CACHE_:
@@ -1982,6 +1981,7 @@ convert_local_omp_clauses (tree *pclauses, struct walk_stmt_info *wi)
 	case OMP_CLAUSE_DEFAULT:
 	case OMP_CLAUSE_COPYIN:
 	case OMP_CLAUSE_COLLAPSE:
+	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE_UNTIED:
 	case OMP_CLAUSE_MERGEABLE:
 	case OMP_CLAUSE_PROC_BIND:
@@ -1994,8 +1994,6 @@ convert_local_omp_clauses (tree *pclauses, struct walk_stmt_info *wi)
 	case OMP_CLAUSE_AUTO:
 	  break;
 
-	  /* OpenACC tile clauses are discarded during gimplification.  */
-	case OMP_CLAUSE_TILE:
 	  /* The following clause belongs to the OpenACC cache directive, which
 	     is discarded during gimplification.  */
 	case OMP_CLAUSE__CACHE_:
diff --git a/gcc/tree.c b/gcc/tree.c
index 804ab5ed58a95a40c09fb7229df50fc1e51e6d5a..3e63415e673eadeca286bfff9786e729e2e83373 100644
--- a/gcc/tree.c
+++ b/gcc/tree.c
@@ -328,7 +328,7 @@ unsigned const char omp_clause_num_ops[] =
   1, /* OMP_CLAUSE_NUM_GANGS  */
   1, /* OMP_CLAUSE_NUM_WORKERS  */
   1, /* OMP_CLAUSE_VECTOR_LENGTH  */
-  1, /* OMP_CLAUSE_TILE  */
+  3, /* OMP_CLAUSE_TILE  */
   2, /* OMP_CLAUSE__GRIDDIM_  */
 };
 
diff --git a/gcc/tree.h b/gcc/tree.h
index f63a678216e411086e5c6da1aba7430eb5204bef..3b12509e7df7eb5e51d72eab7e15250c08d3569b 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -1654,6 +1654,10 @@ extern void protected_set_expr_location (tree, location_t);
 
 #define OMP_CLAUSE_TILE_LIST(NODE) \
   OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_TILE), 0)
+#define OMP_CLAUSE_TILE_ITERVAR(NODE) \
+  OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_TILE), 1)
+#define OMP_CLAUSE_TILE_COUNT(NODE) \
+  OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_TILE), 2)
 
 #define OMP_CLAUSE__GRIDDIM__DIMENSION(NODE) \
   (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE__GRIDDIM_)\
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index 72dc8dacf2845204f5498cb181b05004fc4053c0..132f9d732e9841fa7b2191c85207cc81239f4e21 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,14 @@
+2017-02-09  Nathan Sidwell  <nathan@codesourcery.com>
+	    Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/tile-1.c: New.
+	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Adjust and
+	add additional case.
+	* testsuite/libgomp.oacc-c-c++-common/vprop.c: XFAIL under
+	"openacc_nvidia_accel_selected".
+	* libgomp.oacc-fortran/nested-function-1.f90 (test2):
+	Add num_workers(8) clause.
+
 2017-02-08  John David Anglin  <danglin@gcc.gnu.org>
 
 	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c: Skip on
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
index 3ca9388d405181c191319ccbd8099710c2151437..863b6b38c34ba028797558e2626fadaeb0cd45b1 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
@@ -112,7 +112,7 @@ int vector_1 (int *ary, int size)
 	ary[ix] = place ();
   }
 
-  return check (ary, size, 0, 0, 1);
+  return check (ary, size, 0, 1, 1);
 }
 
 int vector_2 (int *ary, int size)
@@ -196,10 +196,24 @@ int gang_3 (int *ary, int size)
 	ary[ix + jx * 64] = place ();
   }
 
+  return check (ary, size, 1, 1, 1);
+}
+
+int gang_4 (int *ary, int size)
+{
+  clear (ary, size);
+  
+#pragma acc parallel vector_length(32) copy(ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop auto
+    for (int jx = 0; jx <  size; jx++)
+      ary[jx] = place ();
+  }
+
   return check (ary, size, 1, 0, 1);
 }
 
-#define N (32*32*32)
+#define N (32*32*32*2)
 int main ()
 {
   int ondev = 0;
@@ -227,6 +241,8 @@ int main ()
     return 1;
   if (gang_3 (ary,  N))
     return 1;
+  if (gang_4 (ary,  N))
+    return 1;
 
   return 0;
 }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..8dcb956c59fb3257ae6150a41e2952ce207f93e7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c
@@ -0,0 +1,281 @@
+/* This code uses nvptx inline assembly guarded with acc_on_device, which is
+   not optimized away at -O0, and then confuses the target assembler.
+   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
+
+/* { dg-additional-options "-fopenacc-dim=32" } */
+
+#include <stdio.h>
+#include <openacc.h>
+
+static int check (const int *ary, int size, int gp, int wp, int vp)
+{
+  int exit = 0;
+  int ix;
+  int gangs[32], workers[32], vectors[32];
+
+  for (ix = 0; ix < 32; ix++)
+    gangs[ix] = workers[ix] = vectors[ix] = 0;
+  
+  for (ix = 0; ix < size; ix++)
+    {
+      vectors[ary[ix] & 0xff]++;
+      workers[(ary[ix] >> 8) & 0xff]++;
+      gangs[(ary[ix] >> 16) & 0xff]++;
+    }
+
+  for (ix = 0; ix < 32; ix++)
+    {
+      if (gp)
+	{
+	  int expect = gangs[0];
+	  if (gangs[ix] != expect)
+	    {
+	      exit = 1;
+	      printf ("gang %d not used %d times\n", ix, expect);
+	    }
+	}
+      else if (ix && gangs[ix])
+	{
+	  exit = 1;
+	  printf ("gang %d unexpectedly used\n", ix);
+	}
+
+      if (wp)
+	{
+	  int expect = workers[0];
+	  if (workers[ix] != expect)
+	    {
+	      exit = 1;
+	      printf ("worker %d not used %d times\n", ix, expect);
+	    }
+	}
+      else if (ix && workers[ix])
+	{
+	  exit = 1;
+	  printf ("worker %d unexpectedly used\n", ix);
+	}
+
+      if (vp)
+	{
+	  int expect = vectors[0];
+	  if (vectors[ix] != expect)
+	    {
+	      exit = 1;
+	      printf ("vector %d not used %d times\n", ix, expect);
+	    }
+	}
+      else if (ix && vectors[ix])
+	{
+	  exit = 1;
+	  printf ("vector %d unexpectedly used\n", ix);
+	}
+      
+    }
+  return exit;
+}
+
+#pragma acc routine seq
+static int __attribute__((noinline)) place ()
+{
+  int r = 0;
+
+  if (acc_on_device (acc_device_nvidia))
+    {
+      int g = 0, w = 0, v = 0;
+
+      __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g));
+      __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w));
+      __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v));
+      r = (g << 16) | (w << 8) | v;
+    }
+  return r;
+}
+
+static void clear (int *ary, int size)
+{
+  int ix;
+
+  for (ix = 0; ix < size; ix++)
+    ary[ix] = -1;
+}
+
+int gang_vector_1 (int *ary, int size)
+{
+  clear (ary, size);
+#pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop tile(128) gang vector
+    for (int jx = 0; jx < size; jx++)
+      ary[jx] = place ();
+  }
+
+  return check (ary, size, 1, 0, 1);
+}
+
+int gang_vector_2a (int *ary, int size)
+{
+  if (size % 256)
+    return 1;
+  
+  clear (ary, size);
+#pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop tile(64, 64) gang vector
+    for (int jx = 0; jx < size / 256; jx++)
+      for (int ix = 0; ix < 256; ix++)
+	ary[jx * 256 + ix] = place ();
+  }
+
+  return check (ary, size, 1, 0, 1);
+}
+
+int gang_vector_2b (int *ary, int size)
+{
+  if (size % 256)
+    return 1;
+  
+  clear (ary, size);
+#pragma acc parallel vector_length(32) num_gangs (32) copy (ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop tile(64, 64) gang vector
+    for (int jx = 0; jx < size; jx += 256)
+      for (int ix = 0; ix < 256; ix++)
+	ary[jx + ix] = place ();
+  }
+
+  return check (ary, size, 1, 0, 1);
+}
+
+int worker_vector_2a (int *ary, int size)
+{
+  if (size % 256)
+    return 1;
+
+  clear (ary, size);
+#pragma acc parallel vector_length(32) num_workers (32) copy (ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop tile(64, 64) worker vector
+    for (int jx = 0; jx < size / 256; jx++)
+      for (int ix = 0; ix < 256; ix++)
+	ary[jx * 256 + ix] = place ();
+  }
+
+  return check (ary, size, 0, 1, 1);
+}
+
+int worker_vector_2b (int *ary, int size)
+{
+  if (size % 256)
+    return 1;
+
+  clear (ary, size);
+#pragma acc parallel vector_length(32) num_workers (32) copy (ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop tile(64, 64) worker vector
+    for (int jx = 0; jx < size; jx += 256)
+      for (int ix = 0; ix < 256; ix++)
+	ary[jx + ix] = place ();
+  }
+
+  return check (ary, size, 0, 1, 1);
+}
+
+int gang_worker_vector_2a (int *ary, int size)
+{
+  if (size % 256)
+    return 1;
+  clear (ary, size);
+#pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop tile(32, 32)
+    for (int jx = 0; jx < size / 256; jx++)
+      for (int ix = 0; ix < 256; ix++)
+	ary[jx * 256 + ix] = place ();
+  }
+
+  return check (ary, size, 1, 1, 1);
+}
+
+int gang_worker_vector_2b (int *ary, int size)
+{
+  if (size % 256)
+    return 1;
+  clear (ary, size);
+#pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop tile(32, 32)
+    for (int jx = 0; jx < size; jx += 256)
+      for (int ix = 0; ix < 256; ix++)
+	ary[jx + ix] = place ();
+  }
+
+  return check (ary, size, 1, 1, 1);
+}
+
+int gang_worker_vector_star_2a (int *ary, int size)
+{
+  if (size % 256)
+    return 1;
+
+  clear (ary, size);
+#pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop tile(*, *)
+    for (int jx = 0; jx < size / 256; jx++)
+      for (int ix = 0; ix < 256; ix++)
+	ary[jx * 256 + ix] = place ();
+  }
+
+  return check (ary, size, 1, 1, 1);
+}
+
+int gang_worker_vector_star_2b (int *ary, int size)
+{
+  if (size % 256)
+    return 1;
+
+  clear (ary, size);
+#pragma acc parallel vector_length(32) num_workers (32) num_gangs(32) copy (ary[0:size]) firstprivate (size)
+  {
+#pragma acc loop tile(*, *)
+    for (int jx = 0; jx < size; jx +=256)
+      for (int ix = 0; ix < 256; ix++)
+	ary[jx + ix] = place ();
+  }
+
+  return check (ary, size, 1, 1, 1);
+}
+
+#define N (32*32*32*8)
+int main ()
+{
+  int ondev = 0;
+
+#pragma acc parallel copy(ondev)
+  {
+    ondev = acc_on_device (acc_device_not_host);
+  }
+  if (!ondev)
+    return 0;
+  
+  int ary[N];
+  if (gang_vector_1 (ary, N))
+    return 1;
+  if (gang_vector_2a (ary, N))
+    return 1;
+  if (worker_vector_2a (ary, N))
+    return 1;
+  if (gang_worker_vector_2a (ary, N))
+    return 1;
+  if (gang_worker_vector_star_2a (ary, N))
+    return 1;
+  if (gang_vector_2b (ary, N))
+    return 1;
+  if (worker_vector_2b (ary, N))
+    return 1;
+  if (gang_worker_vector_2b (ary, N))
+    return 1;
+  if (gang_worker_vector_star_2b (ary, N))
+    return 1;
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vprop.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vprop.c
index 17b9568a16cf219f273cebf831dcc450f265e18e..c2bce8286d43bb07d5582765fc4dcb955d4626f8 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/vprop.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vprop.c
@@ -1,3 +1,6 @@
+/* { dg-do run } */
+/* { dg-xfail-run-if "PR78266" { openacc_nvidia_accel_selected } { "*" } { "" } } */
+
 #include <assert.h>
 
 #define test(type)				\
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/nested-function-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/nested-function-1.f90
index fdbca4481f852c841b898b3ae996094f47f37f64..c4af1992a05d56664f5783bb2df8d026e49c9ecc 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/nested-function-1.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/nested-function-1.f90
@@ -33,7 +33,7 @@ firstdo: do i = 1, 3
   subroutine test2
     integer :: a(3,3,3), k, kk, kkk, l, ll, lll
     a = 0
-    !$acc parallel
+    !$acc parallel num_workers(8)
     ! Use "gang(static:1)" here and below to effectively turn gang-redundant
     ! execution mode into something like gang-single.
     !$acc loop gang(static:1) collapse(1)